def testCheckpointRestart(self):
    x = Checkpoint(TestCheckpoint.TEST_DIR)
    x.createCheckpointLog(TestCheckpoint.TEST_KEY)
    x.createCheckpointLog(TestCheckpoint.TEST_KEY2)
    x.writeCheckpoint(TestCheckpoint.TEST_KEY, "a", 1)
    x.writeCheckpoint(TestCheckpoint.TEST_KEY, "a", 2)
    x.writeCheckpoint(TestCheckpoint.TEST_KEY2, "b", 3)
    x.releaseCheckpointLog(TestCheckpoint.TEST_KEY)
    x.releaseCheckpointLog(TestCheckpoint.TEST_KEY2)
    # a fresh Checkpoint over the same directory must see the released logs
    y = Checkpoint(TestCheckpoint.TEST_DIR)
    self.assertEqual(y.getCheckpoints("b"), [3])
    self.assertEqual(y.getCheckpoints("a"), [1, 2])
def __init__(self, option, model, train_loader, val_loader, test_loader,
             optimizer, criterion):
    self.option = option
    self.model = model
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.test_loader = test_loader
    self.optimizer = optimizer
    self.criterion = criterion
    self.epoch_loss_plotter = tnt.logger.VisdomPlotLogger(
        'line', opts={'title': 'Epoch Loss', 'xlabel': "Epochs", 'ylabel': "Loss"})
    self.batch_loss_plotter = IncrementVisdomLineLogger(
        opts={'title': 'Batch Loss', 'xlabel': "Batch", 'ylabel': "Loss"})
    self.checkpoint = Checkpoint(option)
    self.best_top1 = 0
    self.start_epoch = 0
    self._load_checkpoint()
def do_checkpoint(self):
    # When making a checkpoint, first write workq and workq_buf into the
    # checkpoint file, then make a copy of workq_db if it exists.
    for fd in self.wfd_cache.values():
        os.close(fd)

    # clear the cache
    self.wfd_cache.clear()

    tmp_file = self.checkpoint_file + ".part"
    with open(tmp_file, "wb") as f:
        self.circle.workq.extend(self.circle.workq_buf)
        self.circle.workq_buf.clear()
        cobj = Checkpoint(self.src, self.dest, self.get_workq(), self.totalsize)
        pickle.dump(cobj, f, pickle.HIGHEST_PROTOCOL)

    # POSIX requires rename to be atomic
    os.rename(tmp_file, self.checkpoint_file)

    # copy the workq_db database file
    if hasattr(self.circle, "workq_db") and len(self.circle.workq_db) > 0:
        self.checkpoint_db = self.checkpoint_file + ".db"
        if not G.resume:
            shutil.copy2(self.circle.dbname, self.checkpoint_db)
        else:
            # In resume mode, make a copy of the current workq db file,
            # which is the provided checkpoint db file.
            self.workdir = os.getcwd()
            existingCheckpoint = os.path.join(
                self.workdir, ".pcp_workq.%s.%s.db" % (G.rid, self.circle.rank))
            shutil.copy2(existingCheckpoint, self.checkpoint_db)
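# A hypothetical counterpart to do_checkpoint() above, restoring the pickled
# state on resume. This is a minimal sketch, not code from the source project;
# it assumes only what the snippet above establishes: the checkpoint file
# holds a single Checkpoint object written with pickle.dump.
import pickle

def load_checkpoint(checkpoint_file):
    """Read back the Checkpoint object written by do_checkpoint()."""
    with open(checkpoint_file, "rb") as f:
        return pickle.load(f)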
def main():
    train_loader, test_loader = get_mnist_data('../%s' % opt.dataset, opt.batch_size)
    model = CapsuleNetwork(opt)
    if opt.cuda:
        model = model.cuda()
    if opt.is_train:
        if opt.resume:
            latest_checkpoint_path = Checkpoint.get_latest_checkpoint(opt.save_folder)
            resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
            model = resume_checkpoint.model
            optimizer = resume_checkpoint.optimizer
            start_epoch = resume_checkpoint.epoch + 1
        else:
            start_epoch = 0
            optimizer = Adam(model.parameters())
        for epoch in range(start_epoch, opt.n_epochs):
            train(epoch, model, train_loader, test_loader, optimizer)
            Checkpoint(model=model, optimizer=optimizer, epoch=epoch).save(opt.save_folder)
    else:
        run_test(model, test_loader)
def testCheckLogCreation(self):
    x = Checkpoint(TestCheckpoint.TEST_DIR)
    x.createCheckpointLog(TestCheckpoint.TEST_KEY)
    self.assertTrue(TestCheckpoint.TEST_KEY in x.getCheckpointLogKeys())
    x.releaseCheckpointLog(TestCheckpoint.TEST_KEY)
    self.assertFalse(TestCheckpoint.TEST_KEY in x.getCheckpointLogKeys())
def init_checkpoints(self):
    self.checkpoint_group.add(Checkpoint("goomba_set_1", 1200))
    self.checkpoint_group.add(Checkpoint("goomba_set_2", 2600))
    self.checkpoint_group.add(Checkpoint("goomba_set_3", 3400))
    self.checkpoint_group.add(Checkpoint("goomba_set_4", 4100))
    self.checkpoint_group.add(Checkpoint("goomba_set_5", 4610))
    self.checkpoint_group.add(Checkpoint("goomba_set_6", 4720))
    self.checkpoint_group.add(Checkpoint("goomba_set_7", 6210))
    self.checkpoint_group.add(Checkpoint("koopa_set_1", 3830))
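# The calls above imply a Checkpoint(name, trigger_x) constructor, and
# checkpoint_group looks like a pygame.sprite.Group. The class below is a
# hypothetical minimal sketch of such a trigger, not the project's actual
# implementation: an invisible checkpoint that fires exactly once when an
# x position first passes its coordinate.
import pygame

class Checkpoint(pygame.sprite.Sprite):
    def __init__(self, name, x):
        super().__init__()
        self.name = name   # identifier, e.g. "goomba_set_1"
        self.x = x         # world x coordinate that arms this checkpoint
        self.triggered = False

    def check(self, player_x):
        """Return True exactly once, the first time player_x reaches self.x."""
        if not self.triggered and player_x >= self.x:
            self.triggered = True
            return True
        return False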
def __init__(self, option, model, train_loader, val_loader, test_loader,
             optimizer, criterion, client_loaders, sybil_loaders,
             iidness=[0.0, 0.0]):
    self.option = option
    self.model = model
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.test_loader = test_loader
    self.optimizer = optimizer
    self.criterion = criterion
    self.iidness = iidness
    self.epoch_loss_plotter = tnt.logger.VisdomPlotLogger(
        'line', opts={'title': 'Epoch Loss', 'xlabel': "Epochs", 'ylabel': "Loss"})
    self.batch_loss_plotter = IncrementVisdomLineLogger(
        opts={'title': 'Batch Loss', 'xlabel': "Batch", 'ylabel': "Loss"})
    self.train_confusion_plotter = tnt.logger.VisdomLogger(
        'heatmap', opts={'title': 'Train Confusion matrix',
                         'columnnames': list(range(option.n_classes)),
                         'rownames': list(range(option.n_classes))})
    self.val_confusion_plotter = tnt.logger.VisdomLogger(
        'heatmap', opts={'title': 'Val Confusion matrix',
                         'columnnames': list(range(option.n_classes)),
                         'rownames': list(range(option.n_classes))})
    self.memory = None
    self.wv_history = []
    self.client_loaders = client_loaders
    self.sybil_loaders = sybil_loaders
    self.checkpoint = Checkpoint(option)
    self.best_top1 = 0
    self.start_epoch = 0
    self._load_checkpoint()
def main():
    if len(sys.argv) != 5:
        print("Usage: " + sys.argv[0] + " <directory> <album name> <app key> <app secret>")
        sys.exit(1)
    directory = sys.argv[1]
    checkpoint = Checkpoint(directory)
    photos = get_photos(directory)
    upload_photos(sys.argv[2], photos, sys.argv[3], sys.argv[4], checkpoint)
def testCheckpointWrite(self):
    x = Checkpoint(TestCheckpoint.TEST_DIR)
    x.createCheckpointLog(TestCheckpoint.TEST_KEY)
    x.writeCheckpoint(TestCheckpoint.TEST_KEY, "a", 1)
    x.writeCheckpoint(TestCheckpoint.TEST_KEY, "a", 2)
    x.writeCheckpoint(TestCheckpoint.TEST_KEY, "b", 3)
    x.releaseCheckpointLog(TestCheckpoint.TEST_KEY)
    self.assertEqual(sorted(x.getCheckpointKeys()), ["a", "b"])
    self.assertEqual(x.getCheckpoints("b"), [3])
    self.assertEqual(x.getCheckpoints("a"), [1, 2])
def train(args):
    configure(args['log_dir'])
    dial_data = get_dataloader(
        os.path.join(args['data_dir'], 'encoded_train_dialogue_pair.json'),
        os.path.join(args['data_dir'], 'vocabulary.json'),
        args['batch_size'])
    vocab = Vocabulary()
    vocab.load_vocab(os.path.join(args['data_dir'], 'vocabulary.json'))
    args['voca_size'] = len(vocab.word2idx)
    model = Seq2Seq(args).cuda() if torch.cuda.is_available() else Seq2Seq(args)
    optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
    criterion = nn.NLLLoss(ignore_index=vocab.get_idx('PADED'))
    min_valid_loss = float('inf')
    for epoch in range(args['epoches']):
        for batch_idx, (sour, sour_len, targ, targ_len) in enumerate(dial_data):
            if torch.cuda.is_available():
                sour = sour.cuda()
                targ = targ.cuda()
            loss = train_batch(model, optimizer, criterion,
                               (sour, sour_len, targ, targ_len))
            logger.info('training loss:{}'.format(loss))
            log_value('CrossEntropy loss', loss, epoch * len(dial_data) + batch_idx)
            if (batch_idx + epoch * len(dial_data)) % args['valid_step'] == 0:
                valid_loader = get_dataloader(
                    os.path.join(args['data_dir'], 'encoded_valid_dialogue_pair.json'),
                    os.path.join(args['data_dir'], 'vocabulary.json'),
                    args['batch_size'])
                valid_loss = validate(model, valid_loader, criterion)
                valid_step = int((batch_idx + epoch * len(dial_data)) / args['valid_step'])
                log_value('valid loss', valid_loss, valid_step)
                logger.info('valid_step:{} valid_loss:{}'.format(valid_step, valid_loss))
                checkpoint = Checkpoint(model, optimizer, epoch, batch_idx)
                checkpoint.save(args['exp_dir'])
def main():
    # create target output dir if it doesn't exist yet
    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    # enable mixed-precision computation if desired
    if args.amp:
        mixed_precision.enable_mixed_precision()

    # set the RNG seeds (probably more hidden elsewhere...)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # get the dataset
    dataset = get_dataset(args.dataset)
    enc_size = get_encoder_size(dataset)

    # get a helper object for tensorboard logging
    log_dir = os.path.join(args.output_dir, args.run_name)
    stat_tracker = StatTracker(log_dir=log_dir)

    # get dataloaders for training and testing
    train_loader, test_loader, num_classes = build_dataset(
        dataset=dataset,
        batch_size=args.batch_size,
        input_dir=args.input_dir,
        labeled_only=args.classifiers)
    torch_device = torch.device('cuda')

    # create new model with random parameters
    model = Model(ndf=args.ndf, n_classes=num_classes, n_rkhs=args.n_rkhs,
                  tclip=args.tclip, n_depth=args.n_depth, enc_size=enc_size,
                  use_bn=(args.use_bn == 1))
    model.init_weights(init_scale=1.0)

    # restore model parameters from a checkpoint if requested
    checkpoint = Checkpoint(model, args.cpt_load_path, args.output_dir, args.cpt_name)
    model = model.to(torch_device)

    # select which type of training to do
    task = train_classifiers if args.classifiers else train_self_supervised

    # do the real stuff...
    task(model, args.learning_rate, dataset, train_loader, test_loader,
         stat_tracker, checkpoint, args.output_dir, torch_device)
def set_checkpoints(self):
    pressed_keys = pygame.key.get_pressed()
    checkpoint_start = None
    checkpoint_id = 0
    while not pressed_keys[K_RETURN]:
        pressed_keys = pygame.key.get_pressed()
        self.on_render()
        for checkpoint in self._checkpoints:
            checkpoint.draw(self._display_surface)
        pygame.display.update()
        if checkpoint_start is not None:
            pygame.draw.line(self._display_surface, (255, 255, 0),
                             checkpoint_start, pygame.mouse.get_pos(), width=3)
            pygame.display.update()
        for event in pygame.event.get():
            left, _, _ = pygame.mouse.get_pressed()
            if left:
                # first click sets the start point, second click closes the segment
                if checkpoint_start is None:
                    checkpoint_start = pygame.mouse.get_pos()
                else:
                    checkpoint = Checkpoint(str(checkpoint_id), checkpoint_start,
                                            pygame.mouse.get_pos())
                    self._checkpoints.append(checkpoint)
                    checkpoint_start = None
                    checkpoint_id += 1
            # Handle the D key being pressed
            elif event.type == pygame.KEYDOWN and event.key == K_d:
                # If we were drawing a new checkpoint, cancel it
                if checkpoint_start is not None:
                    checkpoint_start = None
                    continue
                # Otherwise, remove the last drawn checkpoint
                elif len(self._checkpoints) > 0:
                    self._checkpoints = self._checkpoints[:-1]
def load_objects(self):
    for x, y in place_objects(CHECK):
        self.game_objects["check"].append(Checkpoint(x * 8, y * 8, self.plr))
    for x, y in place_objects(SWITCH):
        self.game_objects["switch"].append(Switch(x * 8, y * 8))
    for x, y in place_objects(BAD_ROBOT):
        self.game_objects["robot"].append(BadRobot(x * 8, y * 8, self.cam))
    for x, y in place_objects(BADDER_ROBOT):
        self.game_objects["robot"].append(BadRobot(x * 8, y * 8, self.cam, True))
    for x, y in place_objects(MOVING_PLATFORM):
        self.game_objects["platform"].append(MobilePlatform(x * 8, y * 8))
    for x, y in place_objects(MOVING_PLATFORM_OPPOSITE):
        self.game_objects["platform"].append(MobilePlatform(x * 8, y * 8, True))
    for x, y in place_objects(LASER):
        self.game_objects["laser"].append(Laser(x * 8, y * 8, self, self.cam, 3))
    for x, y in place_objects(FAST_LASER):
        self.game_objects["laser"].append(Laser(x * 8, y * 8, self, self.cam, 2))
    for x, y in place_objects(VERY_FAST_LASER):
        self.game_objects["laser"].append(Laser(x * 8, y * 8, self, self.cam, 1))
    for gate_id in range(1, GATE_IDS + 1):
        for x, y in place_objects(GATE_START_ADDRESS + gate_id):
            self.game_objects["gate"].append(Gate(x * 8, y * 8, gate_id))
#                input_dir=args.input_dir,
#                labeled_only=True)
num_classes = 10
torch_device = torch.device('cuda')

# create new model with random parameters
model = Model(ndf=args.ndf, n_classes=num_classes, n_rkhs=args.n_rkhs,
              tclip=args.tclip, n_depth=args.n_depth, enc_size=enc_size,
              use_bn=(args.use_bn == 1))
model.init_weights(init_scale=1.0)

# restore model parameters from a checkpoint if requested
checkpoint = Checkpoint(model, args.cpt_load_path, args.output_dir, args.cpt_name)
model = model.to(torch_device)

# select which type of training to do
task = train_classifiers if args.classifiers else train_self_supervised

# load a pretrained checkpoint and strip the DataParallel "module." prefix
# from the parameter names before loading the state dict
ckpt = torch.load('/root/amdim-public/runs_stl64_norm_BN/cifar_amdim_cpt.pth')
params = ckpt['model']
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in params.items():
    name = k.replace("module.", "")
    new_state_dict[name] = v
# print(new_state_dict)
model.load_state_dict(new_state_dict)
# model.load_state_dict(params)
def main():
    colorama.init()
    print("Thank you for using canvas_grab!")
    print(f"You are using version {VERSION}. If you have any questions, "
          f"please file an issue at {Fore.BLUE}https://github.com/skyzh/canvas_grab/issues{Style.RESET_ALL}")
    print(f"You may review {Fore.GREEN}README(_zh-hans).md{Style.RESET_ALL} "
          f"and {Fore.GREEN}LICENSE{Style.RESET_ALL} shipped with this release")

    config.load_config()

    if config.ENABLE_VIDEO:
        print("Note: You've enabled video download. You should install the required tools yourself.")
        print(f"      This is an experimental feature and takes up a large amount of bandwidth. "
              f"{Fore.RED}Use at your own risk.{Style.RESET_ALL}")

    canvas = Canvas(config.API_URL, config.API_KEY)

    try:
        print(f'{Fore.BLUE}Logging in...{Style.RESET_ALL}')
        print(f"{Fore.GREEN}Logged in to {config.API_URL} as {canvas.get_current_user()}{Style.RESET_ALL}")
    except canvasapi.exceptions.InvalidAccessToken:
        print(f"{Fore.RED}Invalid access token, please check your config.API_KEY in config file")
        if is_windows():
            # for windows double-click user
            input()
        exit()

    try:
        global checkpoint
        checkpoint = Checkpoint(config.CHECKPOINT_FILE)
        checkpoint.load()
    except FileNotFoundError:
        print(f"{Fore.RED}No checkpoint found{Style.RESET_ALL}")

    courses = [course for course in canvas.get_courses() if hasattr(course, "name")]

    if config.WHITELIST_CANVAS_ID:
        print(f"{Fore.BLUE}Whitelist mode enabled{Style.RESET_ALL}")
        courses = [course for course in courses if course.id in config.WHITELIST_CANVAS_ID]

    try:
        for course in courses:
            if course.start_at:
                delta = -(datetime.strptime(course.start_at, r'%Y-%m-%dT%H:%M:%S%z')
                          .replace(tzinfo=None) - datetime.now()).days
            else:
                delta = 0
            if course.id in config.IGNORED_CANVAS_ID:
                print(f"{Fore.CYAN}Explicitly Ignored Course: {course.course_code}{Style.RESET_ALL}")
            elif config.RETAIN_COURSE_DAYS != 0 and delta > config.RETAIN_COURSE_DAYS:
                print(f"{Fore.CYAN}Outdated Course: {course.course_code}{Style.RESET_ALL}")
            else:
                try:
                    process_course(course)
                except KeyboardInterrupt:
                    raise
                except canvasapi.exceptions.Unauthorized as e:
                    print(f"{Fore.RED}An error occurred when processing this course (unauthorized): {e}{Style.RESET_ALL}")
                except canvasapi.exceptions.ResourceDoesNotExist as e:
                    print(f"{Fore.RED}An error occurred when processing this course (resource does not exist): {e}{Style.RESET_ALL}")
        if config.SCAN_STALE_FILE:
            scan_stale_files(courses)
    except KeyboardInterrupt:
        print(f"{Fore.RED}Terminated due to keyboard interrupt.{Style.RESET_ALL}")

    checkpoint.dump()

    if new_files_list:
        print(f"{Fore.GREEN}{len(new_files_list)} new or updated files:{Style.RESET_ALL}")
        for f in new_files_list:
            print(f"  {f}")
    if updated_files_list:
        print(f"{Fore.GREEN}{len(updated_files_list)} files have a more recent version on Canvas:{Style.RESET_ALL}")
        for f in updated_files_list:
            print(f"  {f}")
    if failure_file_list:
        print(f"{Fore.YELLOW}{len(failure_file_list)} files are not downloaded:{Style.RESET_ALL}")
        for f in failure_file_list:
            print(f"  {f}")
    if not new_files_list and not updated_files_list:
        print("All files up to date")

    if config.ENABLE_VIDEO:
        print(f"{Fore.GREEN}{len(ffmpeg_commands)} videos resolved{Style.RESET_ALL}")
        print(f"Please run the automatically-generated script "
              f"{Fore.BLUE}download_video.(sh/ps1){Style.RESET_ALL} to download all videos.")
        with open("download_video.sh", 'w') as file:
            file.write("\n".join(ffmpeg_commands))
        with open("download_video.ps1", 'w') as file:
            file.write("\n".join(ffmpeg_commands))

    if config.ALLOW_VERSION_CHECK:
        check_latest_version()

    print(f"{Fore.GREEN}Done.{Style.RESET_ALL}")
    if is_windows():
        # for windows double-click user
        input()
def __init__(self):
    self.bytes_count = 0
    self.start_time = 0.0
    self.out_files = {}
    self.checkpoint = Checkpoint(config.VALUE_SET)
    self.init_time()
def train_epochs(self, encoder, decoder, start_epoch, start_step,
                 train_data, dev_data, end_epoch, log_file):
    # Prepare constants
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    params = {'batch_size': self.batch_size,
              'shuffle': True,
              'num_workers': 4,
              'drop_last': True}

    # Prepare dataloaders and remaining constants.
    training_data = DataLoader(train_data, **params)
    val_data = DataLoader(dev_data, **params)
    steps_per_epoch = len(training_data)
    step = start_step
    tot_steps = steps_per_epoch * (end_epoch - start_epoch)
    elapsed_steps = 0

    for epoch in range(start_epoch, end_epoch):
        print("Epoch: {:d} Step: {:d}".format(epoch, step), file=open(log_file, 'a'))
        start = time.time()
        elapsed_steps = 0
        epoch_loss_total = 0.0
        reconstruction_accuracy_total = 0.0
        loss_total = 0.0
        KL_div_total = 0.0

        for batch in training_data:
            batch = batch.to(device)
            loss, reconstruction_accuracy, KL_div = self.train_batch(
                step, encoder, decoder, batch, self.inverse_sigmoid(step))
            loss_total += loss
            epoch_loss_total += loss
            reconstruction_accuracy_total += reconstruction_accuracy
            KL_div_total += KL_div
            step += 1
            elapsed_steps += 1

            if step % self.print_every == 0:
                if elapsed_steps > self.print_every:
                    cnt = self.print_every
                else:
                    cnt = elapsed_steps
                loss_avg = loss_total / cnt
                reconstruction_accuracy_avg = reconstruction_accuracy_total / cnt
                KL_div_avg = KL_div_total / cnt
                loss_total = 0
                reconstruction_accuracy_total = 0
                KL_div_total = 0
                print(("Progress: {:.2f}%"
                       " Average Loss: {:2.2f}"
                       " Reconstruction Accuracy: {:2.2f}%"
                       " KL Divergence: {:2.2f}").format(
                           (elapsed_steps / steps_per_epoch) * 100,
                           loss_avg, reconstruction_accuracy_avg, KL_div_avg),
                      file=open(log_file, 'a'))

            if step % self.checkpoint_every == 0:
                print("Trying to checkpoint.")
                Checkpoint(encoder=encoder,
                           decoder=decoder,
                           epoch=epoch,
                           step=step,
                           optimizer=self.optimizer,
                           scheduler=self.scheduler,
                           samp_rate=self.samp_rate,
                           KL_rate=self.KL_rate,
                           free_bits=self.free_bits).save(self.exp_dir)
                print("Checkpoint successful!")

        print("End of epoch. Time elapsed: " + timer(start, time.time()),
              file=open(log_file, 'a'))
        print("Average loss for this epoch: {:2.2f}".format(epoch_loss_total / elapsed_steps),
              file=open(log_file, 'a'))
        Checkpoint(encoder=encoder,
                   decoder=decoder,
                   epoch=epoch + 1,
                   step=step,
                   optimizer=self.optimizer,
                   scheduler=self.scheduler,
                   samp_rate=self.samp_rate,
                   KL_rate=self.KL_rate,
                   free_bits=self.free_bits).save(self.exp_dir)

        # Now compute validation, once with the sampling flag set to 1 and
        # once with 0 (the latter reported as NF).
        with torch.no_grad():
            reconstruction_accuracy_val = 0.0
            reconstruction_accuracy_val_nf = 0.0
            val_loss = 0.0
            val_KL_tot = 0.0
            val_loss_nf = 0.0
            val_KL_tot_nf = 0.0
            count = 0
            for val_batch in val_data:
                count += 1
                val_batch = val_batch.to(device)
                batch_loss, batch_accuracy, val_KL = self.loss(
                    step, encoder, decoder, val_batch, 1)
                batch_loss_nf, batch_accuracy_nf, val_KL_nf = self.loss(
                    step, encoder, decoder, val_batch, 0)
                val_loss += batch_loss
                reconstruction_accuracy_val += batch_accuracy
                val_KL_tot += val_KL
                val_loss_nf += batch_loss_nf
                reconstruction_accuracy_val_nf += batch_accuracy_nf
                val_KL_tot_nf += val_KL_nf
            reconstruction_accuracy_val /= count
            val_loss /= count
            val_KL_tot /= count
            reconstruction_accuracy_val_nf /= count
            val_loss_nf /= count
            val_KL_tot_nf /= count
            print("Validation results: ", file=open(log_file, 'a'))
            print("Reconstruction Accuracy: {:2.2f}%"
                  " Loss (Validation): {:2.2f}"
                  " KL Divergence {:2.2f}".format(
                      100 * reconstruction_accuracy_val, val_loss, val_KL_tot),
                  file=open(log_file, 'a'))
            print("Reconstruction Accuracy (NF): {:2.2f}%"
                  " Loss (NF): {:2.2f}"
                  " KL Divergence (NF) {:2.2f}".format(
                      100 * reconstruction_accuracy_val_nf, val_loss_nf, val_KL_tot_nf),
                  file=open(log_file, 'a'))
def _train_gan(self):
    """
    TODO: Add in autoencoder to perform dimensionality reduction on data
    TODO: Not working yet - trying to work out good autoencoder model first
    :return:
    """
    criterion = nn.BCELoss()

    discriminator_optimiser = optim.Adam(self.discriminator.parameters(),
                                         lr=0.003, betas=(0.5, 0.999))
    discriminator_scheduler = optim.lr_scheduler.LambdaLR(
        discriminator_optimiser, lambda epoch: 0.97 ** epoch)
    discriminator_checkpoint = Checkpoint("discriminator")
    discriminator_epoch = 0
    if discriminator_checkpoint.load():
        discriminator_epoch = self.load_state(discriminator_checkpoint,
                                              self.discriminator,
                                              discriminator_optimiser)
    else:
        LOG.info('Discriminator checkpoint not found')

    generator_optimiser = optim.Adam(self.generator.parameters(),
                                     lr=0.003, betas=(0.5, 0.999))
    generator_scheduler = optim.lr_scheduler.LambdaLR(
        generator_optimiser, lambda epoch: 0.97 ** epoch)
    generator_checkpoint = Checkpoint("generator")
    generator_epoch = 0
    if generator_checkpoint.load():
        generator_epoch = self.load_state(generator_checkpoint,
                                          self.generator,
                                          generator_optimiser)
    else:
        LOG.info('Generator checkpoint not found')

    if discriminator_epoch is None or generator_epoch is None:
        epoch = 0
        LOG.info("Discriminator or generator failed to load, training from start")
    else:
        epoch = min(generator_epoch, discriminator_epoch)
        LOG.info("Generator loaded at epoch {0}".format(generator_epoch))
        LOG.info("Discriminator loaded at epoch {0}".format(discriminator_epoch))
        LOG.info("Training from lowest epoch {0}".format(epoch))

    vis_path = os.path.join(os.path.splitext(self.config.FILENAME)[0],
                            "gan", str(datetime.now()))
    with Visualiser(vis_path) as vis:
        real_labels = None  # all 1s
        fake_labels = None  # all 0s
        epochs_complete = 0
        while epoch < self.config.MAX_EPOCHS:
            if self.check_requeue(epochs_complete):
                return  # Requeue needed and training not complete

            for step, (data, noise1, noise2) in enumerate(self.data_loader):
                batch_size = data.size(0)
                if real_labels is None or real_labels.size(0) != batch_size:
                    real_labels = self.generate_labels(batch_size, [1.0])
                if fake_labels is None or fake_labels.size(0) != batch_size:
                    fake_labels = self.generate_labels(batch_size, [0.0])

                if self.config.USE_CUDA:
                    data = data.cuda()
                    noise1 = noise1.cuda()
                    noise2 = noise2.cuda()

                # ============= Train the discriminator =============
                # Pass real noise through first - ideally the discriminator will return 1
                d_output_real = self.discriminator(data)  # [1, 0]
                # Pass generated noise through - ideally the discriminator will return 0
                d_output_fake1 = self.discriminator(self.generator(noise1))  # [0, 1]

                # Determine the loss of the discriminator by adding up the real
                # and fake loss, and backpropagate
                d_loss_real = criterion(d_output_real, real_labels)   # How good the discriminator is on real input
                d_loss_fake = criterion(d_output_fake1, fake_labels)  # How good the discriminator is on fake input
                d_loss = d_loss_real + d_loss_fake
                self.discriminator.zero_grad()
                d_loss.backward()
                discriminator_optimiser.step()

                # =============== Train the generator ===============
                # Pass fake noise into the generator and get it to generate "real"
                # noise, then judge how good this noise is with the discriminator
                d_output_fake2 = self.discriminator(self.generator(noise2))

                # Determine the loss of the generator using the discriminator,
                # and backpropagate
                g_loss = criterion(d_output_fake2, real_labels)
                self.discriminator.zero_grad()
                self.generator.zero_grad()
                g_loss.backward()
                generator_optimiser.step()

                vis.step(d_loss_real.item(), d_loss_fake.item(), g_loss.item())

                # Report data and save checkpoint
                fmt = "Epoch [{0}/{1}], Step[{2}/{3}], d_loss_real: {4:.4f}, d_loss_fake: {5:.4f}, g_loss: {6:.4f}"
                LOG.info(fmt.format(epoch + 1, self.config.MAX_EPOCHS, step + 1,
                                    len(self.data_loader), d_loss_real, d_loss_fake, g_loss))

            epoch += 1
            epochs_complete += 1

            discriminator_checkpoint.set(self.discriminator.state_dict(),
                                         discriminator_optimiser.state_dict(),
                                         epoch).save()
            generator_checkpoint.set(self.generator.state_dict(),
                                     generator_optimiser.state_dict(),
                                     epoch).save()

            vis.plot_training(epoch)
            data, noise1, _ = iter(self.data_loader).__next__()
            if self.config.USE_CUDA:
                data = data.cuda()
                noise1 = noise1.cuda()
            vis.test(epoch, self.data_loader.get_input_size_first(),
                     self.discriminator, self.generator, noise1, data)

            generator_scheduler.step(epoch)
            discriminator_scheduler.step(epoch)

            LOG.info("Learning rates: d {0} g {1}".format(
                discriminator_optimiser.param_groups[0]["lr"],
                generator_optimiser.param_groups[0]["lr"]))

    LOG.info("GAN Training complete")
def _train_autoencoder(self):
    """
    Main training loop for the autoencoder.
    This function will return False if:
    - Loading the autoencoder succeeded, but the NN model did not load the state dicts correctly.
    - The script needs to be re-queued because the NN has been trained for REQUEUE_EPOCHS.
    :return: True if training was completed, False if training needs to continue.
    :rtype bool
    """
    criterion = nn.SmoothL1Loss()
    optimiser = optim.Adam(self.generator.parameters(), lr=0.00003, betas=(0.5, 0.999))
    checkpoint = Checkpoint("autoencoder")
    epoch = 0
    if checkpoint.load():
        epoch = self.load_state(checkpoint, self.autoencoder, optimiser)
        if epoch is not None and epoch >= self.config.MAX_AUTOENCODER_EPOCHS:
            LOG.info("Autoencoder already trained")
            return True
        else:
            LOG.info("Autoencoder training beginning from epoch {0}".format(epoch))
    else:
        LOG.info('Autoencoder checkpoint not found. Training from start')

    # Train autoencoder
    self._autoencoder.set_mode(Autoencoder.Mode.AUTOENCODER)
    vis_path = os.path.join(os.path.splitext(self.config.FILENAME)[0],
                            "autoencoder", str(datetime.now()))
    with Visualiser(vis_path) as vis:
        epochs_complete = 0
        while epoch < self.config.MAX_AUTOENCODER_EPOCHS:
            if self.check_requeue(epochs_complete):
                return False  # Requeue needed and training not complete

            for step, (data, _, _) in enumerate(self.data_loader):
                if self.config.USE_CUDA:
                    data = data.cuda()

                if self.config.ADD_DROPOUT:
                    # Drop out parts of the input, but compute loss on the full input.
                    out = self.autoencoder(nn.functional.dropout(data, 0.5))
                else:
                    out = self.autoencoder(data)

                loss = criterion(out.cpu(), data.cpu())
                self.autoencoder.zero_grad()
                loss.backward()
                optimiser.step()

                vis.step_autoencoder(loss.item())

                # Report data and save checkpoint
                fmt = "Epoch [{0}/{1}], Step[{2}/{3}], loss: {4:.4f}"
                LOG.info(fmt.format(epoch + 1, self.config.MAX_AUTOENCODER_EPOCHS,
                                    step, len(self.data_loader), loss))

            epoch += 1
            epochs_complete += 1

            checkpoint.set(self.autoencoder.state_dict(), optimiser.state_dict(), epoch).save()

            LOG.info("Plotting autoencoder progress")
            vis.plot_training(epoch)
            data, _, _ = iter(self.data_loader).__next__()
            vis.test_autoencoder(epoch, self.autoencoder, data.cuda())

    LOG.info("Autoencoder training complete")
    return True  # Training complete
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs,
         policy_hid_list, valfunc_hid_list, gpu_pct, restore_path, animate,
         submit):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    # killer = GracefulKiller()
    env, obs_dim, act_dim = init_osim(animate)
    env.seed(111 + mpi_util.rank)
    mpi_util.set_global_seeds(111 + mpi_util.rank)

    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    if mpi_util.rank == 0:
        # aigym_path = os.path.join('/tmp', env_name, now)
        # env = wrappers.Monitor(env, aigym_path, force=True)
        logger = Logger(logname=env_name, now=now)

    episode = 0
    checkpoint = Checkpoint("saves", now)
    # restore from checkpoint?
    if restore_path:
        (policy, val_func, scaler, episode, obs_dim, act_dim,
         kl_targ) = checkpoint.restore(restore_path)
    else:
        policy = Policy(obs_dim, act_dim, kl_targ)
        val_func = NNValueFunction(obs_dim)
        scaler = Scaler(obs_dim)

        if mpi_util.rank == 0:
            # run a few episodes (on node 0) of untrained policy to initialize scaler:
            trajectories = run_policy(env, policy, scaler, episodes=5)
            unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
            scaler.update(unscaled)  # update running statistics for scaling observations

        # broadcast policy weights, scaler, val_func
        (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(
            policy, scaler, val_func)

        if mpi_util.rank == 0:
            checkpoint.save(policy, val_func, scaler, episode)

    if animate:
        observes, actions, rewards, unscaled_obs = run_episode(
            env, policy, scaler, animate=animate)
        exit(0)

    if submit:
        # Settings
        # remote_base = 'http://grader.crowdai.org:1729'
        remote_base = 'http://grader.crowdai.org:1730'
        token = 'a83412a94593cae3a491f3ee28ff44e1'
        client = Client(remote_base)

        # Create environment
        observation = client.env_create(token)
        step = 0.0
        observes, actions, rewards, unscaled_obs = [], [], [], []
        scale, offset = scaler.get()
        scale[-1] = 1.0   # don't scale time step feature
        offset[-1] = 0.0  # don't offset time step feature

        # Run one step at a time. The grader runs 3 simulations of at most
        # 1000 steps each; we stop after the last one.
        while True:
            obs = np.array(observation).astype(np.float32).reshape((1, -1))
            print("OBSERVATION TYPE:", type(obs), obs.shape)
            print(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            unscaled_obs.append(obs)
            obs = (obs - offset) * scale  # center and scale observations
            observes.append(obs)

            action = policy.sample(obs).astype(np.float32).reshape((-1, 1))
            print("ACTION TYPE:", type(action), action.shape)
            print(action)
            actions.append(action)

            [observation, reward, done, info] = client.env_step(action.tolist())
            print("step:", step, "reward:", reward)

            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)
            step += 1e-3  # increment time step feature

            if done:
                print("================================== RESTARTING =================================")
                observation = client.env_reset()
                step = 0.0
                observes, actions, rewards, unscaled_obs = [], [], [], []
                scale, offset = scaler.get()
                scale[-1] = 1.0   # don't scale time step feature
                offset[-1] = 0.0  # don't offset time step feature
                if not observation:
                    break

        client.submit()
        exit(0)

    ######

    worker_batch_size = int(batch_size / mpi_util.nworkers)  # HACK
    if (worker_batch_size * mpi_util.nworkers != batch_size):
        print("batch_size:", batch_size, " is not divisible by nworkers:",
              mpi_util.nworkers)
        exit(1)

    batch = 0
    while episode < num_episodes:
        if mpi_util.rank == 0 and batch > 0 and batch % 10 == 0:
            checkpoint.save(policy, val_func, scaler, episode)
        batch = batch + 1

        trajectories = run_policy(env, policy, scaler, episodes=worker_batch_size)
        trajectories = mpi_util.gather_trajectories(trajectories)

        if mpi_util.rank == 0:
            # concatenate trajectories into one list
            trajectories = list(itertools.chain.from_iterable(trajectories))
            print("did a batch of ", len(trajectories), " trajectories")
            print([t['rewards'].sum() for t in trajectories])

            episode += len(trajectories)
            add_value(trajectories, val_func)      # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
            add_gae(trajectories, gamma, lam)      # calculate advantage

            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)

            # add various stats to training log:
            logger.log({
                '_MeanReward': np.mean([t['rewards'].sum() for t in trajectories]),
                'Steps': np.sum([t['observes'].shape[0] for t in trajectories])
            })
            log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)

            policy.update(observes, actions, advantages, logger)  # update policy
            val_func.fit(observes, disc_sum_rew, logger)          # update value function

            unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
            scaler.update(unscaled)     # update running statistics for scaling observations
            logger.write(display=True)  # write logger results to file and stdout

        # if mpi_util.rank == 0 and killer.kill_now:
        #     if input('Terminate training (y/[n])? ') == 'y':
        #         break
        #     killer.kill_now = False

        # broadcast policy weights, scaler, val_func
        (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(
            policy, scaler, val_func)

    if mpi_util.rank == 0:
        logger.close()
        policy.close_sess()
    if mpi_util.rank == 0:
        val_func.close_sess()
'''
All the elements of the level
'''
murs = [Mur((0, 400)), Mur((71, 400)), Mur((400, 550))]
plateformes = [Plateforme((142, 450)), Plateforme((142, 250))]
obstacles = [ObstacleTest((0, 0)), ObstacleTest((100, 0))]
ennemis = [Ennemi1((1000, 200)), Ennemi2((1200, 200))]
boss = Boss((1800, 0))
checkpoints = [
    Checkpoint("red", (200, 530)),
    Checkpoint("blue", (300, 530)),
    Checkpoint("green", (400, 530)),
    Checkpoint("yellow", (500, 530))
]
joueur = (0, 200)

'''
niveau
The structure of a level
@type {Dictionary}
'''
niveau = {
    'murs': murs,
import utility
import data
import model
import loss
from option import args
from checkpoint import Checkpoint
from trainer import Trainer

utility.set_seed(args.seed)  # set the random seed so results can be reproduced

checkpoint = Checkpoint(args)
if checkpoint.ok:
    loader = data.Data(args)
    model = model.Model(args, checkpoint)
    loss = loss.Loss(args, checkpoint) if not args.test_only else None
    t = Trainer(args, loader, model, loss, checkpoint)
    while not t.terminate():
        t.train()
        t.test()
    checkpoint.done()
def training(edit_net, nepochs, args, vocab, print_every=100, check_every=500, test=False):
    if test:
        print(args.data_path + 'test.df.filtered.pos')
        eval_dataset = data.Dataset(args.data_path + 'test.df.filtered.pos')  # load eval dataset
    else:
        print(args.data_path + 'val.df.filtered.pos')
        eval_dataset = data.Dataset(args.data_path + 'val.df.filtered.pos')  # load eval dataset

    evaluator = Evaluator(loss=nn.NLLLoss(ignore_index=vocab.w2i['PAD'], reduction='none'))
    editnet_optimizer = torch.optim.Adam(edit_net.parameters(), lr=1e-3, weight_decay=1e-6)
    # scheduler = MultiStepLR(abstract_optimizer, milestones=[20,30,40], gamma=0.1)
    # abstract_scheduler = ReduceLROnPlateau(abstract_optimizer, mode='max')

    # uncomment this part to re-weight different operations
    # NLL_weight = reweight_global_loss(args.w_add, args.w_keep, args.w_del)
    # NLL_weight_t = torch.from_numpy(NLL_weight).float().cuda()
    # editnet_criterion = nn.NLLLoss(weight=NLL_weight_t, ignore_index=vocab.w2i['PAD'], reduce=False)
    editnet_criterion = nn.NLLLoss(ignore_index=vocab.w2i['PAD'], reduction='none')

    # initialized to infinity so the first evaluation always checkpoints
    # (an initial value of 0. would mean a positive loss never improves on it)
    best_eval_loss = float('inf')

    # init statistics
    print_loss = []  # Reset every print_every

    for epoch in range(nepochs):
        # scheduler.step()
        # reload training data for every epoch
        if os.path.isfile(args.data_path + 'train.df.filtered.pos'):
            train_dataset = data.Dataset(args.data_path + 'train.df.filtered.pos')
        else:  # iter chunks and vocab_data
            train_dataset = data.Datachunk(args.data_path + 'train.df.filtered.pos')

        for i, batch_df in train_dataset.batch_generator(batch_size=args.batch_size, shuffle=True):
            # time1 = time.time()
            prepared_batch, syn_tokens_list = data.prepare_batch(
                batch_df, vocab, args.max_seq_len)  # comp, scpn, simp

            # a batch of complex tokens in vocab ids, sorted in descending order
            org_ids = prepared_batch[0]
            org_lens = org_ids.ne(0).sum(1)
            org = sort_by_lens(org_ids, org_lens)  # inp = [inp_sorted, inp_lengths_sorted, inp_sort_order]

            # a batch of pos-tags in pos-tag ids for complex
            org_pos_ids = prepared_batch[1]
            org_pos_lens = org_pos_ids.ne(0).sum(1)
            org_pos = sort_by_lens(org_pos_ids, org_pos_lens)

            out = prepared_batch[2][:, :]
            tar = prepared_batch[2][:, 1:]
            simp_ids = prepared_batch[3]

            editnet_optimizer.zero_grad()
            output = edit_net(org, out, org_ids, org_pos, simp_ids)

            # calculate loss
            tar_lens = tar.ne(0).sum(1).float()
            tar_flat = tar.contiguous().view(-1)
            loss = editnet_criterion(output.contiguous().view(-1, vocab.count),
                                     tar_flat).contiguous()
            loss[tar_flat == 1] = 0  # remove loss for UNK
            loss = loss.view(tar.size())
            loss = loss.sum(1).float()
            loss = loss / tar_lens
            loss = loss.mean()
            print_loss.append(loss.item())

            loss.backward()
            torch.nn.utils.clip_grad_norm_(edit_net.parameters(), 1.)
            editnet_optimizer.step()

            if i % print_every == 0:
                log_msg = 'Epoch: %d, Step: %d, Loss: %.4f' % (epoch, i, np.mean(print_loss))
                print_loss = []
                print(log_msg)

            # Checkpoint
            if i % check_every == 0:
                edit_net.eval()
                val_loss, bleu_score, sari, sys_out = evaluator.evaluate(
                    eval_dataset, vocab, edit_net, args)
                log_msg = "epoch %d, step %d, Dev loss: %.4f, Bleu score: %.4f, Sari: %.4f \n" % (
                    epoch, i, val_loss, bleu_score, sari)
                print(log_msg)
                if val_loss < best_eval_loss:
                    best_eval_loss = val_loss
                    Checkpoint(model=edit_net,
                               opt=editnet_optimizer,
                               epoch=epoch,
                               step=i).save(args.store_dir)
                    print("checked after %d steps" % i)
                edit_net.train()

    print(edit_net)
    return edit_net
def __init__(self):
    # Outer and inner track boundaries, stored as (x1, y1, x2, y2) segments.
    outer = [
        (100, 200, 100, 180), (100, 180, 100, 100), (100, 100, 200, 50),
        (200, 50, 415, 90), (415, 90, 531, 20), (531, 20, 622, 73),
        (622, 73, 591, 220), (591, 220, 405, 262), (405, 262, 322, 367),
        (322, 367, 458, 475), (458, 475, 563, 350), (563, 350, 713, 287),
        (713, 287, 867, 62), (867, 62, 1120, 118), (1120, 118, 1242, 325),
        (1242, 325, 1057, 416), (1057, 416, 1067, 479), (1067, 479, 1253, 559),
        (1253, 559, 1249, 643), (1249, 643, 1109, 700), (1109, 700, 882, 692),
        (882, 692, 833, 584), (833, 584, 657, 577), (657, 577, 497, 671),
        (497, 671, 164, 636), (164, 636, 70, 465), (70, 465, 126, 318),
        (126, 318, 100, 200),
    ]
    inner = [
        (200, 200, 200, 180), (200, 180, 186, 137), (186, 137, 221, 120),
        (221, 120, 424, 165), (424, 165, 525, 92), (525, 92, 553, 116),
        (553, 116, 540, 186), (540, 186, 323, 207), (323, 207, 239, 375),
        (239, 375, 455, 557), (455, 557, 616, 445), (616, 445, 802, 330),
        (802, 330, 914, 172), (914, 172, 1061, 179), (1061, 179, 1100, 284),
        (1100, 284, 956, 378), (956, 378, 984, 508), (984, 508, 1152, 606),
        (1152, 606, 1152, 634), (1152, 634, 1061, 645), (1061, 645, 959, 620),
        (959, 620, 893, 526), (893, 526, 616, 529), (616, 529, 473, 602),
        (473, 602, 249, 575), (249, 575, 186, 466), (186, 466, 214, 319),
        (214, 319, 200, 200),
    ]
    self.walls = [Boundary(x1, y1, x2, y2) for x1, y1, x2, y2 in outer + inner]

    # Checkpoint gates: each spans from an outer-wall vertex to the matching
    # inner-wall vertex, numbered 0..27 in track order.
    gates = [
        (100, 200, 200, 200), (100, 180, 200, 180), (100, 100, 186, 137),
        (200, 50, 221, 120), (415, 90, 424, 165), (531, 20, 525, 92),
        (622, 73, 553, 116), (591, 220, 540, 186), (405, 262, 323, 207),
        (322, 367, 239, 375), (458, 475, 455, 557), (563, 350, 616, 445),
        (713, 287, 802, 330), (867, 62, 914, 172), (1120, 118, 1061, 179),
        (1242, 325, 1100, 284), (1057, 416, 956, 378), (1067, 479, 984, 508),
        (1253, 559, 1152, 606), (1249, 643, 1152, 634), (1109, 700, 1061, 645),
        (882, 692, 959, 620), (833, 584, 893, 526), (657, 577, 616, 529),
        (497, 671, 473, 602), (164, 636, 249, 575), (70, 465, 186, 466),
        (126, 318, 214, 319),
    ]
    self.checkpoints = [Checkpoint(x1, y1, x2, y2, i)
                        for i, (x1, y1, x2, y2) in enumerate(gates)]
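# The Checkpoint(x1, y1, x2, y2, idx) gates above are typically used to detect
# when a car crosses the next gate. A hypothetical helper (not the project's
# code) using the standard counter-clockwise orientation test for segment
# intersection between the car's movement vector and a gate; the attribute
# names cp.x1 .. cp.y2 are assumptions about the Checkpoint class.
def _ccw(ax, ay, bx, by, cx, cy):
    # True if points A, B, C are in counter-clockwise order
    return (cy - ay) * (bx - ax) > (by - ay) * (cx - ax)

def crosses_checkpoint(prev_pos, pos, cp):
    """True if the segment prev_pos->pos intersects checkpoint cp's segment."""
    (px, py), (qx, qy) = prev_pos, pos
    ax, ay, bx, by = cp.x1, cp.y1, cp.x2, cp.y2
    return (_ccw(px, py, ax, ay, bx, by) != _ccw(qx, qy, ax, ay, bx, by) and
            _ccw(px, py, qx, qy, ax, ay) != _ccw(px, py, qx, qy, bx, by))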
def main(_):
    checkpoint = Checkpoint(FLAGS.checkpoint_dir)
    utils.exists_or_mkdir(FLAGS.sample_dir)
    utils.exists_or_mkdir(FLAGS.log_dir)
    summaryWriter = tensorboardX.SummaryWriter(log_dir=FLAGS.log_dir)  # torch.utils.tensorboard.SummaryWriter(log_dir=FLAGS.log_dir)

    logger.info('[Params] lr:%f, size:%d, dataset:%s, av_gen:%d, n_disc:%d' %
                (FLAGS.learning_rate, FLAGS.output_size, FLAGS.dataset,
                 int(FLAGS.use_averaged_gen), FLAGS.n_discriminator))

    # dataset
    z_shape = (FLAGS.z_dim,)
    image_size = (FLAGS.output_size, FLAGS.output_size)
    image_shape = (3,) + image_size
    ds = dataset.datasets.from_name(name=FLAGS.dataset,
                                    data_folder=FLAGS.data_folder,
                                    output_size=image_size)
    batch = batch_gen.BatchWithNoise(ds, batch_size=FLAGS.batch_size,
                                     z_shape=z_shape, num_workers=10)

    # initialize device
    device = utils.get_torch_device()

    # model
    nn_model = models.model_factory.create_model(FLAGS.model_name,
                                                 device=device,
                                                 image_shape=image_shape,
                                                 z_shape=z_shape,
                                                 use_av_gen=FLAGS.use_averaged_gen,
                                                 g_tanh=False)
    nn_model.register_checkpoint(checkpoint)

    loss = gan_loss.js_loss()
    # lambd = lambda_scheduler.Constant(0.1)
    lambd = lambda_scheduler.ThresholdAnnealing(1000.,
                                                threshold=loss.lambda_switch_level,
                                                min_switch_step=FLAGS.lambda_switch_steps,
                                                verbose=True)
    checkpoint.register('lambda', lambd, True)

    trainer = Trainer(model=nn_model, batch=batch, loss=loss,
                      lr=FLAGS.learning_rate, reg='gp', lambd=lambd)
    trainer.sub_batches = FLAGS.batch_per_update
    trainer.register_checkpoint(checkpoint)

    it_start = checkpoint.load(FLAGS.checkpoint_it_to_load)
    trainer.update_lr()

    ##========================= LOAD CONTEXT ================================##
    context_path = os.path.join(FLAGS.checkpoint_dir, 'context.npz')
    sample_seed = None
    if os.path.exists(context_path):
        sample_seed = np.load(context_path)['z']
        if sample_seed.shape[0] != FLAGS.sample_size or sample_seed.shape[1] != FLAGS.z_dim:
            sample_seed = None
            logger.info('Invalid sample seed')
        else:
            logger.info('Sample seed loaded')
    if sample_seed is None:
        sample_seed = batch.sample_z(FLAGS.sample_size).data.numpy()
        np.savez(context_path, z=sample_seed)

    ##========================= TRAIN MODELS ================================##
    batches_per_epoch = 10000
    total_time = 0
    bLambdaSwitched = (it_start == 0)
    n_too_good_d = []
    number_of_iterations = FLAGS.epoch * batches_per_epoch

    for it in range(number_of_iterations):
        start_time = time.time()
        iter_counter = it + it_start

        # updates the discriminator
        # if iter_counter < 25 or iter_counter % 500 == 0:
        #     d_iter = 20
        # else:
        #     d_iter = 5
        if bLambdaSwitched:
            # if lambda was switched we want to keep the discriminator optimal
            logger.info('[!] Warming up discriminator')
            d_iter = 25
        else:
            d_iter = FLAGS.n_discriminator

        errD, s, errG, b_too_good_D = trainer.update(d_iter, 1)

        summaryWriter.add_scalar('d_loss', errD, iter_counter)
        summaryWriter.add_scalar('slope', s, iter_counter)
        summaryWriter.add_scalar('g_loss', errG, iter_counter)
        summaryWriter.add_scalar('loss', errD + float(lambd) * s ** 2, iter_counter)
        summaryWriter.add_scalar('lambda', float(lambd), iter_counter)

        # updating lambda
        n_too_good_d.append(b_too_good_D)
        if len(n_too_good_d) > 20:
            del n_too_good_d[0]

        bLambdaSwitched = lambd.update(errD)
        if not bLambdaSwitched and sum(n_too_good_d) > 10:
            bLambdaSwitched = lambd.switch()

        end_time = time.time()
        iter_time = end_time - start_time
        total_time += iter_time

        logger.info("[%2d/%2d] time: %4.4f, d_loss: %.8f, s: %.4f, g_loss: %.8f" %
                    (iter_counter, it_start + number_of_iterations, iter_time,
                     errD, s, errG))

        if np.mod(iter_counter, FLAGS.sample_step) == 0 and it > 0:
            n = int(np.sqrt(FLAGS.sample_size))
            img = trainer.sample(sample_seed)
            img = img.data.cpu()
            img_tb = utils.image_to_tensorboard(torchvision.utils.make_grid(img, n))
            summaryWriter.add_image('samples', img_tb, iter_counter)
            utils.save_images(img.data.cpu().numpy(), [n, n],
                              './{}/train_{:02d}.png'.format(FLAGS.sample_dir, iter_counter))

        if np.mod(iter_counter, FLAGS.save_step) == 0 and it > 0:
            checkpoint.save(iter_counter)

    checkpoint.save(iter_counter)
def _train_epochs(self, data, model, n_epochs, start_epoch, start_step,
                  dev_data=None, teacher_forcing_ratio=0):
    log = self.logger
    print_loss_total = 0  # Reset every print_every
    epoch_loss_total = 0  # Reset every epoch

    device = None if torch.cuda.is_available() else -1
    batch_iterator = torchtext.data.BucketIterator(
        dataset=data, batch_size=self.batch_size,
        sort=False, sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device, repeat=False)

    steps_per_epoch = len(batch_iterator)
    total_steps = steps_per_epoch * n_epochs

    step = start_step
    step_elapsed = 0
    best_acc = None
    save_flag = False
    for epoch in range(start_epoch, n_epochs + 1):
        log.debug("Epoch: %d, Step: %d" % (epoch, step))

        batch_generator = batch_iterator.__iter__()
        # consume batches already seen in a previous training run
        for _ in range((epoch - 1) * steps_per_epoch, step):
            next(batch_generator)

        model.train(True)
        for batch in batch_generator:
            step += 1
            step_elapsed += 1

            input_variables, input_lengths = getattr(batch, GlobalNames.src_field_name)
            target_variables = getattr(batch, GlobalNames.tgt_field_name)

            loss = self._train_batch(input_variables, input_lengths.tolist(),
                                     target_variables, model, teacher_forcing_ratio)

            # Record average loss
            print_loss_total += loss
            epoch_loss_total += loss

            if step % self.print_every == 0 and step_elapsed > self.print_every:
                print_loss_avg = print_loss_total / self.print_every
                print_loss_total = 0
                log_msg = 'Progress: %d%%, Train %s: %.4f' % (
                    step / total_steps * 100, self.loss.name, print_loss_avg)
                log.info(log_msg)

            if step % self.checkpoint_every == 0 or step == total_steps:
                if dev_data is not None:
                    dev_loss, accuracy = self.evaluator.evaluate2(model, dev_data)
                    # valid_score and write_docs are assumed to be module-level
                    # helpers that persist the validation history
                    valid_score.append(accuracy)
                    write_docs(GlobalNames.valid_result, valid_score)
                    self.optimizer.update(dev_loss, epoch)
                    log_msg = "Dev %s: %.4f, Accuracy: %s" % (
                        self.loss.name, dev_loss, accuracy)
                    model.train(mode=True)
                    if best_acc is not None:
                        if accuracy > best_acc:
                            save_flag = True
                            best_acc = accuracy
                    else:
                        best_acc = accuracy
                        save_flag = True
                else:
                    save_flag = True
                    # no dev loss available, so update with the running epoch average
                    epoch_loss_avg = epoch_loss_total / max(1, step - start_step)
                    self.optimizer.update(epoch_loss_avg, epoch)
                    log_msg = 'Train %s (running avg): %.4f' % (self.loss.name, epoch_loss_avg)

                if save_flag:
                    print("***saving model in {} with {} ***".format(self.expt_dir, best_acc))
                    Checkpoint(model=model,
                               optimizer=self.optimizer,
                               epoch=epoch, step=step,
                               input_vocab=data.fields[GlobalNames.src_field_name].vocab,
                               output_vocab=data.fields[GlobalNames.tgt_field_name].vocab
                               ).save(self.expt_dir)
                    save_flag = False

                log.info(log_msg)

        if step_elapsed == 0:
            continue

        epoch_loss_avg = epoch_loss_total / min(steps_per_epoch, step - start_step)
        epoch_loss_total = 0
        log_msg = "Finished epoch %d: Train %s: %.4f" % (epoch, self.loss.name, epoch_loss_avg)
        log.info(log_msg)
def checkpoint(doc=None):
    from checkpoint import Checkpoint
    if not doc:
        doc = document()
    return Checkpoint(doc, {'x': 'y'}, 0, 'mick-and-bandit')
def setUp(self):
    os.mkdir(TestActionsStateMachine.TEST_DIR)
    self.cp = Checkpoint(TestActionsStateMachine.TEST_DIR)
    self.cp.createCheckpointLog(TestActionsStateMachine.TEST_KEY)
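# A setUp() that creates TEST_DIR normally has a matching tearDown(). The
# sketch below is a hypothetical counterpart, not from the source tests: it
# releases the log opened in setUp() and removes the directory. It assumes
# nothing beyond the methods used above, plus shutil.rmtree for cleanup.
import shutil

def tearDown(self):
    try:
        self.cp.releaseCheckpointLog(TestActionsStateMachine.TEST_KEY)
    except Exception:
        pass  # the test itself may already have released the log
    shutil.rmtree(TestActionsStateMachine.TEST_DIR)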
def main(args):
    configure(os.path.join(args['exp_dir'], 'log_dir'))
    transform = transforms.Compose([
        transforms.RandomCrop(args['crop_size']),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    data_loader = get_loader({
        'data_dir': args['data_dir'],
        'exp_dir': args['exp_dir'],
        'raw_data_dir': args['raw_data_dir'],
        'batch_size': args['batch_size'],
        'transform': transform,
        'num_workers': args['num_workers'],
        'shuffle': args['shuffle'],
        'mode': 'train'
    })
    # valid_data_loader = get_loader({'data_dir': args['data_dir'],
    #                                 'raw_data_dir': args['raw_data_dir'],
    #                                 'batch_size': int(args['batch_size'] / 4),
    #                                 'transform': transform,
    #                                 'num_workers': args['num_workers'],
    #                                 'shuffle': args['shuffle'],
    #                                 'mode': 'validate'})
    args['vocab_size'] = len(Vocabulary.load_vocab(args['exp_dir']))

    encoder = EncoderCNN(args).train()
    decoder = DecoderRNN(args).train()

    if args['pretrained']:
        checkpoint_path = Checkpoint.get_latest_checkpoint(args['exp_dir'])
        checkpoint = Checkpoint.load(checkpoint_path)
        encoder.load_state_dict(checkpoint.encoder)
        decoder.load_state_dict(checkpoint.decoder)
        step = checkpoint.step
        epoch = checkpoint.epoch
        omit = True
    else:
        step = 0
        epoch = 0
        omit = False

    encoder.to(device)
    decoder.to(device)

    criterion = nn.CrossEntropyLoss()
    params = (list(decoder.parameters()) + list(encoder.linear.parameters()) +
              list(encoder.bn.parameters()))
    # params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args['lr'])
    scheduler = StepLR(optimizer, step_size=40, gamma=0.1)
    # optimizer = YFOptimizer(params)

    total_step = len(data_loader)
    min_valid_loss = float('inf')
    for epoch in range(epoch, args['num_epochs']):
        scheduler.step()
        for idx, (images, captions, leng) in enumerate(data_loader):
            # when resuming, skip the batches already seen in this epoch
            if omit:
                if idx < (step - total_step * epoch):
                    logger.info('idx:{},step:{}, epoch:{}, total_step:{}, diss:{}'.format(
                        idx, step, epoch, total_step, step - total_step * epoch))
                    continue
                else:
                    omit = False

            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, leng, batch_first=True)[0]

            features = encoder(images)
            outputs = decoder(features, captions, leng)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), 5)
            optimizer.step()

            log_value('loss', loss.item(), step)
            step += 1
            if step % args['log_step'] == 0:
                logger.info('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'.format(
                    epoch, args['num_epochs'], idx, total_step,
                    loss.item(), np.exp(loss.item())))
            if step % args['valid_step'] == 0:
                # valid_loss = validate(encoder.eval(), decoder, criterion, valid_data_loader)
                # if valid_loss < min_valid_loss:
                #     min_valid_loss = valid_loss
                Checkpoint(encoder, decoder, optimizer, epoch, step).save(args['exp_dir'])
def _train_epoches(self, data, model, n_epochs, start_epoch, start_step,
                   dev_data=None, teacher_forcing_ratio=0):
    log = self.logger
    print_loss_total = 0  # Reset every print_every
    epoch_loss_total = 0  # Reset every epoch

    device = None if torch.cuda.is_available() else -1
    log.debug("Data Desc: Examples Len:{}, Fields Len:{}".format(
        len(data.examples), len(data.fields.items())))
    batch_iterator = torchtext.data.BucketIterator(
        dataset=data, batch_size=self.batch_size,
        sort=False, sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device, repeat=False)

    steps_per_epoch = len(batch_iterator)
    total_steps = steps_per_epoch * n_epochs
    log.debug("Steps per Epoch:{}".format(steps_per_epoch))

    step = start_step
    step_elapsed = 0
    for epoch in range(start_epoch, n_epochs + 1):
        log.debug("Epoch: %d, Step: %d" % (epoch, step))

        batch_generator = batch_iterator.__iter__()
        # consuming seen batches from previous training
        for _ in range((epoch - 1) * steps_per_epoch, step):
            next(batch_generator)

        model.train(True)
        for batch in batch_generator:
            step += 1
            step_elapsed += 1
            # log.debug("In step count:%d" % (step))

            input_variables, input_lengths = getattr(batch, 'src')
            target_variables = getattr(batch, 'tgt')

            loss = self._train_batch(input_variables, input_lengths.tolist(),
                                     target_variables, model, teacher_forcing_ratio)

            # Record average loss
            print_loss_total += loss
            epoch_loss_total += loss

            if step % self.print_every == 0 and step_elapsed > self.print_every:
                print_loss_avg = print_loss_total / self.print_every
                log_msg = 'Progress: %d%%, Train %s: Total %6.2f Avg %.4f' % (
                    step / total_steps * 100, self.loss.name,
                    print_loss_total, print_loss_avg)
                print_loss_total = 0
                log.info(log_msg)

            # Checkpoint
            if step % self.checkpoint_every == 0 or step == total_steps:
                Checkpoint(model=model,
                           optimizer=self.optimizer,
                           epoch=epoch, step=step,
                           input_vocab=data.fields['src'].vocab,
                           output_vocab=data.fields['tgt'].vocab).save(self.expt_dir)

        if step_elapsed == 0:
            continue

        epoch_loss_avg = epoch_loss_total / min(steps_per_epoch, step - start_step)
        epoch_loss_total = 0
        log_msg = "Finished epoch %d: Train %s: %.4f" % (epoch, self.loss.name, epoch_loss_avg)
        if dev_data is not None:
            dev_loss, accuracy = self.evaluator.evaluate(model, dev_data)
            self.optimizer.update(dev_loss, epoch)
            log_msg += ", Dev %s: %.4f, Accuracy: %.4f" % (self.loss.name, dev_loss, accuracy)
            model.train(mode=True)
        else:
            self.optimizer.update(epoch_loss_avg, epoch)
        log.info(log_msg)
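# The checkpoints written by the trainers above are consumed elsewhere via
# Checkpoint.get_latest_checkpoint() and Checkpoint.load(), as seen in the
# CapsuleNetwork and captioning mains earlier in this section. A minimal
# resume sketch assuming that same API; expt_dir is a placeholder path, and
# the .model/.optimizer/.epoch/.step attributes mirror those snippets:
def resume_latest(expt_dir):
    latest_path = Checkpoint.get_latest_checkpoint(expt_dir)
    resume_checkpoint = Checkpoint.load(latest_path)
    # continue training from the step after the one that was saved
    return (resume_checkpoint.model,
            resume_checkpoint.optimizer,
            resume_checkpoint.epoch,
            resume_checkpoint.step + 1)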