train_sampler = SubsetRandomSampler(train_index)
val_sampler = SubsetRandomSampler(val_index)

# Download the test data
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform_val_test)

# Data loaders; for the train and validation data we use the samplers to subset the data
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=10, sampler=train_sampler, num_workers=4,
    generator=torch.Generator().manual_seed(58))
valloader = torch.utils.data.DataLoader(valset, batch_size=10,
                                        sampler=val_sampler, num_workers=4)
testloader = torch.utils.data.DataLoader(testset, batch_size=10, num_workers=4)

# Set CUDA as device if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Class of the VGG19 neural network; it has to inherit from nn.Module
class VGG19_CIFAR10(nn.Module):
    def __init__(self):
        # Call super constructor of the class
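# --- Hedged aside (not from the original source): the snippet above hands the
# seeded torch.Generator to the DataLoader, but a user-supplied sampler such as
# SubsetRandomSampler is not driven by DataLoader's `generator` argument (that
# argument seeds worker base seeds and the default sampler). To make the subset
# shuffle itself reproducible, the generator can be passed to the sampler
# directly. Minimal, self-contained sketch with placeholder data:
import torch
from torch.utils.data import DataLoader, SubsetRandomSampler, TensorDataset

dataset = TensorDataset(torch.arange(100).float().unsqueeze(1))
train_index = list(range(80))  # placeholder split indices

g = torch.Generator().manual_seed(58)
train_sampler = SubsetRandomSampler(train_index, generator=g)
loader = DataLoader(dataset, batch_size=10, sampler=train_sampler)

first_pass = [batch[0].squeeze(1).tolist() for batch in loader]
g.manual_seed(58)  # re-seeding reproduces the exact same batch order
second_pass = [batch[0].squeeze(1).tolist() for batch in loader]
assert first_pass == second_pass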
def train_and_report_stats( config: ConfigSchema, rank: Rank = RANK_ZERO, ) -> Generator[Tuple[int, Optional[Stats], Stats, Optional[Stats]], None, None]: """Each epoch/pass, for each partition pair, loads in embeddings and edgelist from disk, runs HOGWILD training on them, and writes partitions back to disk. """ if config.verbose > 0: import pprint pprint.PrettyPrinter().pprint(config.to_dict()) log("Loading entity counts...") if maybe_old_entity_path(config.entity_path): log("WARNING: It may be that your entity path contains files using the " "old format. See D14241362 for how to update them.") entity_counts: Dict[str, List[int]] = {} for entity, econf in config.entities.items(): entity_counts[entity] = [] for part in range(econf.num_partitions): with open(os.path.join( config.entity_path, "entity_count_%s_%d.txt" % (entity, part) ), "rt") as tf: entity_counts[entity].append(int(tf.read().strip())) # Figure out how many lhs and rhs partitions we need nparts_lhs, lhs_partitioned_types = get_partitioned_types(config, Side.LHS) nparts_rhs, rhs_partitioned_types = get_partitioned_types(config, Side.RHS) vlog("nparts %d %d types %s %s" % (nparts_lhs, nparts_rhs, lhs_partitioned_types, rhs_partitioned_types)) total_buckets = nparts_lhs * nparts_rhs sync: AbstractSynchronizer bucket_scheduler: AbstractBucketScheduler parameter_sharer: Optional[ParameterSharer] partition_client: Optional[PartitionClient] if config.num_machines > 1: if not 0 <= rank < config.num_machines: raise RuntimeError("Invalid rank for trainer") if not td.is_available(): raise RuntimeError("The installed PyTorch version doesn't provide " "distributed training capabilities.") ranks = ProcessRanks.from_num_invocations( config.num_machines, config.num_partition_servers) if rank == RANK_ZERO: log("Setup lock server...") start_server( LockServer( num_clients=len(ranks.trainers), nparts_lhs=nparts_lhs, nparts_rhs=nparts_rhs, lock_lhs=len(lhs_partitioned_types) > 0, lock_rhs=len(rhs_partitioned_types) > 0, init_tree=config.distributed_tree_init_order, ), server_rank=ranks.lock_server, world_size=ranks.world_size, init_method=config.distributed_init_method, groups=[ranks.trainers], ) bucket_scheduler = DistributedBucketScheduler( server_rank=ranks.lock_server, client_rank=ranks.trainers[rank], ) log("Setup param server...") start_server( ParameterServer(num_clients=len(ranks.trainers)), server_rank=ranks.parameter_servers[rank], init_method=config.distributed_init_method, world_size=ranks.world_size, groups=[ranks.trainers], ) parameter_sharer = ParameterSharer( client_rank=ranks.parameter_clients[rank], all_server_ranks=ranks.parameter_servers, init_method=config.distributed_init_method, world_size=ranks.world_size, groups=[ranks.trainers], ) if config.num_partition_servers == -1: start_server( ParameterServer(num_clients=len(ranks.trainers)), server_rank=ranks.partition_servers[rank], world_size=ranks.world_size, init_method=config.distributed_init_method, groups=[ranks.trainers], ) if len(ranks.partition_servers) > 0: partition_client = PartitionClient(ranks.partition_servers) else: partition_client = None groups = init_process_group( rank=ranks.trainers[rank], world_size=ranks.world_size, init_method=config.distributed_init_method, groups=[ranks.trainers], ) trainer_group, = groups sync = DistributedSynchronizer(trainer_group) dlog = log else: sync = DummySynchronizer() bucket_scheduler = SingleMachineBucketScheduler( nparts_lhs, nparts_rhs, config.bucket_order) parameter_sharer = None partition_client = None dlog = lambda msg: 
None # fork early for HOGWILD threads log("Creating workers...") num_workers = get_num_workers(config.workers) pool = create_pool(num_workers) def make_optimizer(params: Iterable[torch.nn.Parameter], is_emb: bool) -> Optimizer: params = list(params) if len(params) == 0: optimizer = DummyOptimizer() elif is_emb: optimizer = RowAdagrad(params, lr=config.lr) else: if config.relation_lr is not None: lr = config.relation_lr else: lr = config.lr optimizer = Adagrad(params, lr=lr) optimizer.share_memory() return optimizer # background_io is only supported in single-machine mode background_io = config.background_io and config.num_machines == 1 checkpoint_manager = CheckpointManager( config.checkpoint_path, background=background_io, rank=rank, num_machines=config.num_machines, partition_client=partition_client, ) checkpoint_manager.register_metadata_provider(ConfigMetadataProvider(config)) checkpoint_manager.write_config(config) iteration_manager = IterationManager( config.num_epochs, config.edge_paths, config.num_edge_chunks, iteration_idx=checkpoint_manager.checkpoint_version) checkpoint_manager.register_metadata_provider(iteration_manager) if config.init_path is not None: loadpath_manager = CheckpointManager(config.init_path) else: loadpath_manager = None def load_embeddings( entity: EntityName, part: Partition, strict: bool = False, force_dirty: bool = False, ) -> Tuple[torch.nn.Parameter, Optional[OptimizerStateDict]]: if strict: embs, optim_state = checkpoint_manager.read(entity, part, force_dirty=force_dirty) else: # Strict is only false during the first iteration, because in that # case the checkpoint may not contain any data (unless a previous # run was resumed) so we fall back on initial values. embs, optim_state = checkpoint_manager.maybe_read(entity, part, force_dirty=force_dirty) if embs is None and loadpath_manager is not None: embs, optim_state = loadpath_manager.maybe_read(entity, part) if embs is None: embs, optim_state = init_embs(entity, entity_counts[entity][part], config.dimension, config.init_scale) assert embs.is_shared() return torch.nn.Parameter(embs), optim_state log("Initializing global model...") model = make_model(config) trainer = Trainer( global_optimizer=make_optimizer(model.parameters(), False), loss_fn=config.loss_fn, margin=config.margin, relations=config.relations, ) evaluator = TrainingRankingEvaluator( override_num_batch_negs=config.eval_num_batch_negs, override_num_uniform_negs=config.eval_num_uniform_negs, ) eval_batch_size = round_up_to_nearest_multiple(config.batch_size, config.eval_num_batch_negs) state_dict, optim_state = checkpoint_manager.maybe_read_model() if state_dict is None and loadpath_manager is not None: state_dict, optim_state = loadpath_manager.maybe_read_model() if state_dict is not None: model.load_state_dict(state_dict, strict=False) if optim_state is not None: trainer.global_optimizer.load_state_dict(optim_state) vlog("Loading unpartitioned entities...") for entity, econfig in config.entities.items(): if econfig.num_partitions == 1: embs, optim_state = load_embeddings(entity, Partition(0)) model.set_embeddings(entity, embs, Side.LHS) model.set_embeddings(entity, embs, Side.RHS) optimizer = make_optimizer([embs], True) if optim_state is not None: optimizer.load_state_dict(optim_state) trainer.entity_optimizers[(entity, Partition(0))] = optimizer # start communicating shared parameters with the parameter server if parameter_sharer is not None: parameter_sharer.share_model_params(model) strict = False def swap_partitioned_embeddings( old_b: 
Optional[Bucket], new_b: Optional[Bucket], ): # 0. given the old and new buckets, construct data structures to keep # track of old and new embedding (entity, part) tuples io_bytes = 0 log("Swapping partitioned embeddings %s %s" % (old_b, new_b)) types = ([(e, Side.LHS) for e in lhs_partitioned_types] + [(e, Side.RHS) for e in rhs_partitioned_types]) old_parts = {(e, old_b.get_partition(side)): side for e, side in types if old_b is not None} new_parts = {(e, new_b.get_partition(side)): side for e, side in types if new_b is not None} to_checkpoint = set(old_parts) - set(new_parts) preserved = set(old_parts) & set(new_parts) # 1. checkpoint embeddings that will not be used in the next pair # if old_b is not None: # there are previous embeddings to checkpoint log("Writing partitioned embeddings") for entity, part in to_checkpoint: side = old_parts[(entity, part)] vlog("Checkpointing (%s %d %s)" % (entity, part, side.pick("lhs", "rhs"))) embs = model.get_embeddings(entity, side) optim_key = (entity, part) optim_state = OptimizerStateDict(trainer.entity_optimizers[optim_key].state_dict()) io_bytes += embs.nelement() * 4 # ignore optim state checkpoint_manager.write(entity, part, embs.detach(), optim_state) if optim_key in trainer.entity_optimizers: del trainer.entity_optimizers[optim_key] # these variables are holding large objects; let them be freed del embs del optim_state bucket_scheduler.release_bucket(old_b) # 2. copy old embeddings that will be used in the next pair # into a temporary dictionary # tmp_emb = {x: model.get_embeddings(x[0], old_parts[x]) for x in preserved} for entity, _ in types: model.clear_embeddings(entity, Side.LHS) model.clear_embeddings(entity, Side.RHS) if new_b is None: # there are no new embeddings to load return io_bytes # 3. load new embeddings into the model/optimizer, either from disk # or the temporary dictionary # log("Loading entities") for entity, side in types: part = new_b.get_partition(side) part_key = (entity, part) if part_key in tmp_emb: vlog("Loading (%s, %d) from preserved" % (entity, part)) embs, optim_state = tmp_emb[part_key], None else: vlog("Loading (%s, %d)" % (entity, part)) force_dirty = bucket_scheduler.check_and_set_dirty(entity, part) embs, optim_state = load_embeddings( entity, part, strict=strict, force_dirty=force_dirty) io_bytes += embs.nelement() * 4 # ignore optim state model.set_embeddings(entity, embs, side) tmp_emb[part_key] = embs optim_key = (entity, part) if optim_key not in trainer.entity_optimizers: vlog("Resetting optimizer %s" % (optim_key,)) optimizer = make_optimizer([embs], True) if optim_state is not None: vlog("Setting optim state") optimizer.load_state_dict(optim_state) trainer.entity_optimizers[optim_key] = optimizer return io_bytes # Start of the main training loop. for epoch_idx, edge_path_idx, edge_chunk_idx \ in iteration_manager.remaining_iterations(): log("Starting epoch %d / %d edge path %d / %d edge chunk %d / %d" % (epoch_idx + 1, iteration_manager.num_epochs, edge_path_idx + 1, iteration_manager.num_edge_paths, edge_chunk_idx + 1, iteration_manager.num_edge_chunks)) edge_reader = EdgeReader(iteration_manager.edge_path) log("edge_path= %s" % iteration_manager.edge_path) sync.barrier() dlog("Lock client new epoch...") bucket_scheduler.new_pass(is_first=iteration_manager.iteration_idx == 0) sync.barrier() remaining = total_buckets cur_b = None while remaining > 0: old_b = cur_b io_time = 0. 
io_bytes = 0 cur_b, remaining = bucket_scheduler.acquire_bucket() print('still in queue: %d' % remaining, file=sys.stderr) if cur_b is None: if old_b is not None: # if you couldn't get a new pair, release the lock # to prevent a deadlock! tic = time.time() io_bytes += swap_partitioned_embeddings(old_b, None) io_time += time.time() - tic time.sleep(1) # don't hammer td continue def log_status(msg, always=False): f = log if always else vlog f("%s: %s" % (cur_b, msg)) tic = time.time() io_bytes += swap_partitioned_embeddings(old_b, cur_b) current_index = \ (iteration_manager.iteration_idx + 1) * total_buckets - remaining next_b = bucket_scheduler.peek() if next_b is not None and background_io: # Ensure the previous bucket finished writing to disk. checkpoint_manager.wait_for_marker(current_index - 1) log_status("Prefetching") for entity in lhs_partitioned_types: checkpoint_manager.prefetch(entity, next_b.lhs) for entity in rhs_partitioned_types: checkpoint_manager.prefetch(entity, next_b.rhs) checkpoint_manager.record_marker(current_index) log_status("Loading edges") lhs, rhs, rel = edge_reader.read( cur_b.lhs, cur_b.rhs, edge_chunk_idx, config.num_edge_chunks) num_edges = rel.size(0) # this might be off in the case of tensorlist io_bytes += (lhs.nelement() + rhs.nelement() + rel.nelement()) * 4 log_status("Shuffling edges") # Fix a seed to get the same permutation every time; have it # depend on all and only what affects the set of edges. g = torch.Generator() g.manual_seed(hash((edge_path_idx, edge_chunk_idx, cur_b.lhs, cur_b.rhs))) num_eval_edges = int(num_edges * config.eval_fraction) if num_eval_edges > 0: edge_perm = torch.randperm(num_edges, generator=g) eval_edge_perm = edge_perm[-num_eval_edges:] num_edges -= num_eval_edges edge_perm = edge_perm[torch.randperm(num_edges)] else: edge_perm = torch.randperm(num_edges) # HOGWILD evaluation before training eval_stats_before: Optional[Stats] = None if num_eval_edges > 0: log_status("Waiting for workers to perform evaluation") all_eval_stats_before = pool.map(call, [ partial( process_in_batches, batch_size=eval_batch_size, model=model, batch_processor=evaluator, lhs=lhs, rhs=rhs, rel=rel, indices=eval_edge_perm[s], ) for s in split_almost_equally(eval_edge_perm.size(0), num_parts=num_workers) ]) eval_stats_before = Stats.sum(all_eval_stats_before).average() log("stats before %s: %s" % (cur_b, eval_stats_before)) io_time += time.time() - tic tic = time.time() # HOGWILD training log_status("Waiting for workers to perform training") # FIXME should we only delay if iteration_idx == 0? 
all_stats = pool.map(call, [ partial( process_in_batches, batch_size=config.batch_size, model=model, batch_processor=trainer, lhs=lhs, rhs=rhs, rel=rel, indices=edge_perm[s], delay=config.hogwild_delay if epoch_idx == 0 and rank > 0 else 0, ) for rank, s in enumerate(split_almost_equally(edge_perm.size(0), num_parts=num_workers)) ]) stats = Stats.sum(all_stats).average() compute_time = time.time() - tic log_status( "bucket %d / %d : Processed %d edges in %.2f s " "( %.2g M/sec ); io: %.2f s ( %.2f MB/sec )" % (total_buckets - remaining, total_buckets, lhs.size(0), compute_time, lhs.size(0) / compute_time / 1e6, io_time, io_bytes / io_time / 1e6), always=True) log_status("%s" % stats, always=True) # HOGWILD eval after training eval_stats_after: Optional[Stats] = None if num_eval_edges > 0: log_status("Waiting for workers to perform evaluation") all_eval_stats_after = pool.map(call, [ partial( process_in_batches, batch_size=eval_batch_size, model=model, batch_processor=evaluator, lhs=lhs, rhs=rhs, rel=rel, indices=eval_edge_perm[s], ) for s in split_almost_equally(eval_edge_perm.size(0), num_parts=num_workers) ]) eval_stats_after = Stats.sum(all_eval_stats_after).average() log("stats after %s: %s" % (cur_b, eval_stats_after)) # Add train/eval metrics to queue yield current_index, eval_stats_before, stats, eval_stats_after swap_partitioned_embeddings(cur_b, None) # Distributed Processing: all machines can leave the barrier now. sync.barrier() # Write metadata: for multiple machines, write from rank-0 log("Finished epoch %d path %d pass %d; checkpointing global state." % (epoch_idx + 1, edge_path_idx + 1, edge_chunk_idx + 1)) log("My rank: %d" % rank) if rank == 0: for entity, econfig in config.entities.items(): if econfig.num_partitions == 1: embs = model.get_embeddings(entity, Side.LHS) optimizer = trainer.entity_optimizers[(entity, Partition(0))] checkpoint_manager.write( entity, Partition(0), embs.detach(), OptimizerStateDict(optimizer.state_dict())) sanitized_state_dict: ModuleStateDict = {} for k, v in ModuleStateDict(model.state_dict()).items(): if k.startswith('lhs_embs') or k.startswith('rhs_embs'): # skipping state that's an entity embedding continue sanitized_state_dict[k] = v log("Writing metadata...") checkpoint_manager.write_model( sanitized_state_dict, OptimizerStateDict(trainer.global_optimizer.state_dict()), ) log("Writing the checkpoint...") checkpoint_manager.write_new_version(config) dlog("Waiting for other workers to write their parts of the checkpoint: rank %d" % rank) sync.barrier() dlog("All parts of the checkpoint have been written") log("Switching to new checkpoint version...") checkpoint_manager.switch_to_new_version() dlog("Waiting for other workers to switch to the new checkpoint version: rank %d" % rank) sync.barrier() dlog("All workers have switched to the new checkpoint version") # After all the machines have finished committing # checkpoints, we remove the old checkpoints. checkpoint_manager.remove_old_version(config) # now we're sure that all partition files exist, # so be strict about loading them strict = True # quiescence pool.close() pool.join() sync.barrier() checkpoint_manager.close() if loadpath_manager is not None: loadpath_manager.close() # FIXME join distributed workers (not really necessary) log("Exiting")
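# --- Hedged aside (illustrative, not PBG's API): the loop above seeds a fresh
# torch.Generator from a hash of the (edge path, chunk, bucket) coordinates so
# that every worker shuffles a given bucket's edges identically. Hashing a
# tuple of ints is deterministic across processes (PYTHONHASHSEED only affects
# str/bytes), which is what makes this seeding scheme reproducible.
import torch

def bucket_permutation(num_edges, edge_path_idx, edge_chunk_idx, lhs, rhs):
    # Same bucket coordinates -> same seed -> same permutation on every worker.
    g = torch.Generator()
    g.manual_seed(hash((edge_path_idx, edge_chunk_idx, lhs, rhs)))
    return torch.randperm(num_edges, generator=g)

assert torch.equal(bucket_permutation(1000, 0, 0, 2, 3),
                   bucket_permutation(1000, 0, 0, 2, 3))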
train_dataset = jsonDataset(path=config['data']['train'].split(' ')[0], classes=target_classes) valid_dataset = jsonDataset(path=config['data']['valid'].split(' ')[0], classes=target_classes) elif config['data']['name'] == 'landmark': train_data = Landmark_dataset( root='/data/kaggle/dacon_landmark_korea/public', is_train=True) num_classes = train_data.num_classes num_data = len(train_data) num_train = int(num_data * 0.7) num_valid = num_data - num_train train_dataset, valid_dataset = torch.utils.data.random_split( dataset=train_data, lengths=[num_train, num_valid], generator=torch.Generator().manual_seed(config['params']['seed'])) else: raise NotImplementedError('Unsupported Dataset: ' + str(config['data']['name'])) assert train_dataset assert valid_dataset '''loss''' # criterion = nn.CrossEntropyLoss(reduction='mean') criterion = nn.KLDivLoss(reduction='batchmean') '''print out''' print("transform : " + str(transform_train)) print("num. train data : " + str(len(train_dataset))) print("num. valid data : " + str(len(valid_dataset))) print("num_classes : " + str(num_classes))
def train( env: gym.Env, test_env: gym.Env, termination_fn: mbrl.types.TermFnType, cfg: omegaconf.DictConfig, silent: bool = False, work_dir: Optional[str] = None, ) -> np.float32: # ------------------- Initialization ------------------- debug_mode = cfg.get("debug_mode", False) obs_shape = env.observation_space.shape act_shape = env.action_space.shape mbrl.planning.complete_agent_cfg(env, cfg.algorithm.agent) agent = hydra.utils.instantiate(cfg.algorithm.agent) work_dir = work_dir or os.getcwd() # enable_back_compatible to use pytorch_sac agent logger = mbrl.util.Logger(work_dir, enable_back_compatible=True) logger.register_group( mbrl.constants.RESULTS_LOG_NAME, MBPO_LOG_FORMAT, color="green", dump_frequency=1, ) video_recorder = pytorch_sac.VideoRecorder( work_dir if cfg.save_video else None) rng = np.random.default_rng(seed=cfg.seed) torch_generator = torch.Generator(device=cfg.device) if cfg.seed is not None: torch_generator.manual_seed(cfg.seed) # -------------- Create initial overrides. dataset -------------- dynamics_model = mbrl.util.common.create_one_dim_tr_model( cfg, obs_shape, act_shape) replay_buffer = mbrl.util.common.create_replay_buffer(cfg, obs_shape, act_shape, rng=rng) random_explore = cfg.algorithm.random_initial_explore mbrl.util.common.rollout_agent_trajectories( env, cfg.algorithm.initial_exploration_steps, mbrl.planning.RandomAgent(env) if random_explore else agent, {} if random_explore else { "sample": True, "batched": False }, replay_buffer=replay_buffer, ) # --------------------------------------------------------- # --------------------- Training Loop --------------------- rollout_batch_size = (cfg.overrides.effective_model_rollouts_per_step * cfg.algorithm.freq_train_model) trains_per_epoch = int( np.ceil(cfg.overrides.epoch_length / cfg.overrides.freq_train_model)) updates_made = 0 env_steps = 0 model_env = mbrl.models.ModelEnv(env, dynamics_model, termination_fn, None, generator=torch_generator) model_trainer = mbrl.models.ModelTrainer( dynamics_model, optim_lr=cfg.overrides.model_lr, weight_decay=cfg.overrides.model_wd, logger=None if silent else logger, ) best_eval_reward = -np.inf epoch = 0 sac_buffer = None while env_steps < cfg.overrides.num_steps: rollout_length = int( mbrl.util.math.truncated_linear(*(cfg.overrides.rollout_schedule + [epoch + 1]))) sac_buffer_capacity = rollout_length * rollout_batch_size * trains_per_epoch sac_buffer_capacity *= cfg.overrides.num_epochs_to_retain_sac_buffer sac_buffer = maybe_replace_sac_buffer( sac_buffer, sac_buffer_capacity, obs_shape, act_shape, torch.device(cfg.device), ) obs, done = None, False for steps_epoch in range(cfg.overrides.epoch_length): if steps_epoch == 0 or done: obs, done = env.reset(), False # --- Doing env step and adding to model dataset --- next_obs, reward, done, _ = mbrl.util.common.step_env_and_add_to_buffer( env, obs, agent, {}, replay_buffer) # --------------- Model Training ----------------- if (env_steps + 1) % cfg.overrides.freq_train_model == 0: mbrl.util.common.train_model_and_save_model_and_data( dynamics_model, model_trainer, cfg.overrides, replay_buffer, work_dir=work_dir, ) # --------- Rollout new model and store imagined trajectories -------- # Batch all rollouts for the next freq_train_model steps together rollout_model_and_populate_sac_buffer( model_env, replay_buffer, agent, sac_buffer, cfg.algorithm.sac_samples_action, rollout_length, rollout_batch_size, ) if debug_mode: print(f"Epoch: {epoch}. " f"SAC buffer size: {len(sac_buffer)}. " f"Rollout length: {rollout_length}. 
" f"Steps: {env_steps}") # --------------- Agent Training ----------------- for _ in range(cfg.overrides.num_sac_updates_per_step): if (env_steps + 1) % cfg.overrides.sac_updates_every_steps != 0 or len( sac_buffer) < rollout_batch_size: break # only update every once in a while agent.update(sac_buffer, logger, updates_made) updates_made += 1 if not silent and updates_made % cfg.log_frequency_agent == 0: logger.dump(updates_made, save=True) # ------ Epoch ended (evaluate and save model) ------ if (env_steps + 1) % cfg.overrides.epoch_length == 0: avg_reward = evaluate(test_env, agent, cfg.algorithm.num_eval_episodes, video_recorder) logger.log_data( mbrl.constants.RESULTS_LOG_NAME, { "epoch": epoch, "env_step": env_steps, "episode_reward": avg_reward, "rollout_length": rollout_length, }, ) if avg_reward > best_eval_reward: video_recorder.save(f"{epoch}.mp4") best_eval_reward = avg_reward torch.save(agent.critic.state_dict(), os.path.join(work_dir, "critic.pth")) torch.save(agent.actor.state_dict(), os.path.join(work_dir, "actor.pth")) epoch += 1 env_steps += 1 obs = next_obs return np.float32(best_eval_reward)
import torch
import torch.nn as nn
from torch.utils.data import random_split
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor

from poutyne import Experiment

# Instantiate the MNIST dataset
train_valid_dataset = MNIST('./datasets', train=True, download=True, transform=ToTensor())
test_dataset = MNIST('./datasets', train=False, download=True, transform=ToTensor())
train_dataset, valid_dataset = random_split(
    train_valid_dataset, [50_000, 10_000],
    generator=torch.Generator().manual_seed(42)
)

# Select CUDA device if available
cuda_device = 0
device = torch.device('cuda:%d' % cuda_device if torch.cuda.is_available() else 'cpu')

# Define the network
network = nn.Sequential(
    nn.Flatten(),
    nn.Linear(28 * 28, 100),
    nn.ReLU(),
    nn.Linear(100, 10),
)

epochs = 5

# Define the Experiment and train
experiment = Experiment(
    './simple_model',  # Where to log
    network,
    optimizer='sgd',
    loss_function='cross_entropy',
    device=device,
)
experiment.train_dataset(train_dataset, valid_dataset, epochs=epochs)
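# --- Hedged aside (not from the Poutyne example): with a fixed-seed generator,
# random_split is reproducible -- rerunning the split assigns the same indices.
import torch
from torch.utils.data import TensorDataset, random_split

data = TensorDataset(torch.arange(60_000))
split_a = random_split(data, [50_000, 10_000],
                       generator=torch.Generator().manual_seed(42))
split_b = random_split(data, [50_000, 10_000],
                       generator=torch.Generator().manual_seed(42))
# Subset.indices records which samples each split received.
assert split_a[0].indices == split_b[0].indices
assert split_a[1].indices == split_b[1].indices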
'image_size': [args.image_size, args.image_size], 'mean': mean, 'std': std, 'data_dir': args.data_path, 'is_trans': True, 'is_train': True } datasets = SegmentDataset(config) train_num = int(0.9 * len(datasets)) val_num = len(datasets) - train_num split_num = random.randint(0, 100) train_datasets, val_datasets = random_split( datasets, [train_num, val_num], generator=torch.Generator().manual_seed(split_num)) train_loader = DataLoader(train_datasets, batch_size=args.batch_size, shuffle=True, num_workers=args.workers_num, pin_memory=True) val_loader = DataLoader(val_datasets, batch_size=4, shuffle=False, num_workers=args.workers_num, pin_memory=True) # train_loader = DataLoader(datasets, batch_size=args.batch_size, shuffle=True, # num_workers=args.workers_num, pin_memory=True)
def test_random_sampler(setup_cluster): import torch data = mt.random.rand(1000, 32, dtype='f4') labels = mt.random.randint(0, 2, (1000, 10), dtype='f4') train_dataset = MarsDataset(data, labels) # test __init__() with pytest.raises(ValueError) as e: train_sampler = RandomSampler(train_dataset, replacement=1) exec_msg = e.value.args[0] assert exec_msg == "replacement should be a boolean value, but got replacement=1" with pytest.raises(ValueError) as e: train_sampler = RandomSampler(train_dataset, num_samples=900) exec_msg = e.value.args[0] assert exec_msg == "With replacement=False, num_samples should not " + \ "be specified, since a random permute will be performed." with pytest.raises(ValueError) as e: train_sampler = RandomSampler(train_dataset, replacement=True, num_samples=-1) exec_msg = e.value.args[0] assert exec_msg == "num_samples should be a positive integer value, but got num_samples=-1" train_sampler = RandomSampler(train_dataset) # test __len__ num_samples() assert len(train_sampler) == 1000 assert train_sampler.num_samples == 1000 # test __iter__ g_cpu = torch.Generator() g_cpu.manual_seed(2147483647) train_sampler = RandomSampler(train_dataset, generator=g_cpu) assert len(train_sampler) == 1000 train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=32, sampler=train_sampler) for _, (batch_data, batch_labels) in enumerate(train_loader): assert len(batch_data[0]) == 32 assert len(batch_labels[0]) == 10 train_sampler = RandomSampler(train_dataset, replacement=True, num_samples=900) assert len(train_sampler) == 900 train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=32, sampler=train_sampler) for _, (batch_data, batch_labels) in enumerate(train_loader): assert len(batch_data[0]) == 32 assert len(batch_labels[0]) == 10 # torch train model = torch.nn.Sequential( torch.nn.Linear(32, 64), torch.nn.ReLU(), torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 10), torch.nn.Softmax(dim=1), ) optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5) criterion = torch.nn.BCELoss() for _ in range(2): # 2 epochs for _, (batch_data, batch_labels) in enumerate(train_loader): outputs = model(batch_data) loss = criterion(outputs.squeeze(), batch_labels) optimizer.zero_grad() loss.backward() optimizer.step()
def run(self, progressbar=False): self.optimizer = self._make_optimizer(self._params) self.scheduler = self._make_scheduler(self.optimizer) if progressbar: progressbar = tqdm.tqdm(total=self.cycles * self.epochs_per_cycle, mininterval=2.0) assert progressbar is False or isinstance(progressbar, tqdm.std.tqdm) def _enter_epoch(desc, temperature): "Run this at the beginning of each epoch" if progressbar: progressbar.set_description(desc, refresh=False) for g in self.optimizer.param_groups: g['temperature'] = temperature def _is_sampling_epoch(_epoch): "Are we storing a sample at the end of this epoch?" _epoch = _epoch % self.epochs_per_cycle sampling_epoch = _epoch - (self.descent_epochs + self.warmup_epochs) return (0 <= sampling_epoch) and (sampling_epoch % self.skip == 0) # Use an exact gradient for the initial step and loss loss, log_prior, potential = self._exact_model_potential_and_grad( self.dataloader) self.optimizer.sample_momentum() self.optimizer.initial_step(calc_metrics=True, save_state=self.reject_samples) step = 0 self.store_metrics(i=step, loss=loss.item(), log_prior=log_prior.item(), potential=potential.item(), acc=0., lr=self.optimizer.param_groups[0]["lr"], corresponds_to_sample=True, delta_energy=0., total_energy=0., rejected=False) self._initial_potential = potential.item() self._total_energy = 0. assert self.dataloader.sampler.generator is None generator = self.dataloader.sampler.generator = torch.Generator() postfix = {} for cycle in range(self.cycles): generator.seed() cycle_random_state = generator.get_state() for epoch in range(self.epochs_per_cycle): if epoch < self.descent_epochs: _enter_epoch(f"Cycle {cycle}, epoch {epoch}, Descent", 0.) elif epoch - self.descent_epochs < self.warmup_epochs: _enter_epoch(f"Cycle {cycle}, epoch {epoch}, Warmup", self.temperature) else: _enter_epoch(f"Cycle {cycle}, epoch {epoch}, Sampling", self.temperature) # Run one epoch of potentially-stochastic gradient descent # make sure the epochs' data points are always in the same order for this cycle. 
generator.set_state(cycle_random_state) for i, (x, y) in enumerate(self.dataloader): step += 1 loss, log_prior, potential, acc = self._model_potential_and_grad( x.to(self._params[0].device), y.to(self._params[0].device)) store_metrics = (step % self.metrics_skip) == 0 self.optimizer.step(calc_metrics=store_metrics) if store_metrics: delta_energy = self.optimizer.delta_energy( self._initial_potential, potential) self.store_metrics( i=step, loss=loss.item(), log_prior=log_prior.item(), potential=potential.item(), acc=acc.item(), lr=self.optimizer.param_groups[0]["lr"], corresponds_to_sample=False, delta_energy=delta_energy, total_energy=self._total_energy + delta_energy) if progressbar: postfix["train/loss"] = loss.item() postfix["train/acc"] = acc.item() postfix["Δₑ"] = delta_energy progressbar.set_postfix(postfix, refresh=False) # Omit the scheduler step in the last iteration, because we # want to run it after `optimizer.final_step` if i < len(self.dataloader) - 1: self.scheduler.step() if _is_sampling_epoch(epoch): step += 1 # Do the sample's `final_step` using an exact gradient loss, log_prior, potential = self._exact_model_potential_and_grad( self.dataloader) self.optimizer.final_step(calc_metrics=True) delta_energy = self.optimizer.delta_energy( self._initial_potential, potential) self._total_energy += delta_energy self._initial_potential = potential.item() rejected = False if self.reject_samples: rejected, _ = self.optimizer.maybe_reject(delta_energy) self.store_metrics( i=step, loss=loss.item(), log_prior=log_prior.item(), potential=potential.item(), # TODO: do not use stale `acc`, calculate for full training set acc=acc.item(), lr=self.optimizer.param_groups[0]["lr"], corresponds_to_sample=True, delta_energy=delta_energy, total_energy=self._total_energy, rejected=rejected) # Evaluate test accuracy and save to disk the current sample # (correctly rolled back to the previous if rejected) state_dict = self.model.state_dict() eval_results = self._evaluate_model(state_dict, step) self._save_sample(state_dict, cycle, epoch, step) if progressbar: postfix.update(eval_results) postfix["train/loss"] = loss.item() postfix["Δₑ"] = delta_energy progressbar.set_postfix(postfix, refresh=False) self.scheduler.step() # First step for the next epoch, using the same gradient # but potentially a different learning rate if isinstance(self.optimizer, mcmc.HMC): self.optimizer.sample_momentum() self.optimizer.initial_step(calc_metrics=False, save_state=self.reject_samples) else: # Not an epoch that stores a sample at the end # Evaluate test accuracy every epoch eval_results = self._evaluate_model( self.model.state_dict(), step) if progressbar: postfix.update(eval_results) progressbar.set_postfix(postfix, refresh=False) self.scheduler.step() # Update preconditioner, increment progressbar at the end of the epoch if self.precond_update is not None and ( epoch + 1) % self.precond_update == 0: self.optimizer.update_preconditioner() # Important to put here because no new metrics are added # Write metrics to disk every 30 seconds self.metrics_saver.flush(every_s=30) if progressbar: progressbar.update(1) # Close the progressbar at the end of the training procedure if progressbar: progressbar.close()
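# --- Hedged aside (names illustrative): the cycle logic above captures the
# generator state once per cycle (get_state) and restores it at the start of
# every epoch (set_state), so each epoch of a cycle replays the mini-batches
# in exactly the same order. The pattern in isolation:
import torch

g = torch.Generator()
g.seed()                     # fresh entropy for this "cycle"
cycle_state = g.get_state()  # remember where the random stream starts

def epoch_order(n):
    g.set_state(cycle_state)  # rewind so every epoch sees the same shuffle
    return torch.randperm(n, generator=g).tolist()

assert epoch_order(10) == epoch_order(10)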
def setup(self, stage=None):
    # Train set
    self.train_set = CIFAR10(self.params.PATH_DATASET, train=True,
                             transform=self.transform_train)
    # Val and test set
    test_val_set = CIFAR10(self.params.PATH_DATASET, train=False,
                           transform=self.transform_test)
    len_test_val_set = len(test_val_set)
    split = len_test_val_set // 2
    self.val_test, self.test_set = random_split(
        test_val_set, [split, split],
        generator=torch.Generator().manual_seed(42))
    assert len(self.val_test) == len(self.test_set)
def seed(self, seed: Optional[int] = None) -> None:
    seed = create_seed(seed, max_bytes=7)
    self._torch_random = torch.Generator(device=self.device)
    self._torch_random.manual_seed(seed)
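# --- Hedged aside (assumption, not stated in the source): create_seed is a gym
# utility; capping it at max_bytes=7 plausibly keeps the seed below 2**56, well
# inside the 64-bit range torch.Generator.manual_seed accepts. A dependency-free
# sketch of the same idea:
import os
import torch

def fresh_seed(max_bytes=7):
    # 7 random bytes < 2**56, comfortably within manual_seed's 64-bit range.
    return int.from_bytes(os.urandom(max_bytes), "big")

gen = torch.Generator()
gen.manual_seed(fresh_seed())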
def forward(self, query: torch.Tensor, value: torch.Tensor, mask: torch.Tensor, seed: int, random=True): length = query.size(2) bucket_length = length // self.n_buckets query = query / torch.norm(query, dim=-1, keepdim=True) # [batch, head, length, d_k] flattened_query = query.flatten(0, 1) # [batch * head, length, d_k] hashes = self.lsh(flattened_query, random) # [batch * head, length, rounds] sorted_hashes, hash_indices = torch.sort(hashes, dim=1) # [batch * head, length, rounds] expanded_hash_indices = hash_indices[:, :, None, :].expand( -1, -1, self.d_k, -1) # [batch * head, length, d_k, rounds] expanded_query = flattened_query[..., None].expand(-1, -1, -1, self.rounds) # [batch * head, length, d_k, rounds] reordered_query = torch.gather(expanded_query, dim=1, index=expanded_hash_indices) # [batch * head, length, d_k, rounds] reordered_query = reordered_query.reshape(-1, self.n_buckets // 2, bucket_length * 2, self.d_k, self.rounds) # [batch * head, n_buckets // 2, bucket_length * 2, d_k, rounds] lookback_key = look_back(reordered_query) # [batch * head, n_buckets // 2, bucket_length * 4, d_k, rounds] scores = torch.einsum('...ijk,...ljk->...ilk', reordered_query, lookback_key) / math.sqrt(self.d_k) # [batch * head, n_buckets // 2, bucket_length * 2, bucket_length * 4, rounds] mask = mask[:, None, :, None].expand(-1, self.head, -1, self.rounds).flatten(0, 1) # [batch * head, length, rounds] reordered_mask = torch.gather(mask, dim=1, index=hash_indices) # [batch * head, length, rounds] reordered_mask = reordered_mask.reshape(-1, self.n_buckets // 2, bucket_length * 2, self.rounds) # [batch * head, n_buckets // 2, bucket_length * 2, rounds] lookback_mask = look_back(reordered_mask)[..., None, :, :] # [batch * head, n_buckets // 2, 1, bucket_length * 4, rounds] scores.masked_fill_(mask=~lookback_mask, value=-1e9) sorted_hashes = sorted_hashes.reshape(-1, self.n_buckets // 2, bucket_length * 2, self.rounds) # [batch * head, n_buckets // 2, bucket_length * 2, rounds] lookback_hash = look_back(sorted_hashes) # [batch * head, n_buckets // 2, bucket_length * 4, rounds] hash_equiv_mask = (sorted_hashes[..., None, :] != lookback_hash[..., None, :, :]) # [batch * head, n_buckets // 2, bucket_length * 2, bucket_length * 4, rounds] scores.masked_fill_(mask=hash_equiv_mask, value=-1e9) query_indices = hash_indices.reshape(-1, self.n_buckets // 2, bucket_length * 2, self.rounds) # [batch * head, n_buckets // 2, bucket_length * 2, rounds] key_indices = look_back(query_indices) # [batch * head, n_buckets // 2, bucket_length * 4, rounds] causal_mask = query_indices[..., None, :] < key_indices[..., None, :, :] # [batch * head, n_buckets // 2, bucket_length * 2, bucket_length * 4, rounds] scores.masked_fill_(mask=causal_mask, value=-1e9) indice_equiv_mask = query_indices[..., None, :] == key_indices[..., None, :, :] # [batch * head, n_buckets // 2, bucket_length * 2, bucket_length * 4, rounds] scores.masked_fill_(mask=indice_equiv_mask, value=-1e5) original_indices = reverse_sort(hash_indices, dim=1) # [batch * head, length, rounds] score_indices = original_indices[..., None, :].expand( -1, -1, bucket_length * 4, -1) # [batch * head, length, bucket_length * 4, rounds] expanded_key_indices = key_indices[..., None, :, :].expand( -1, -1, bucket_length * 2, -1, -1) # [batch * head, n_buckets // 2, bucket_length * 2, bucket_length * 4, rounds] reordered_key_indices = torch.gather(expanded_key_indices.flatten( 1, 2), dim=1, index=score_indices) # [batch * head, length, bucket_length * 4, rounds] 
flat_reordered_key = reordered_key_indices.flatten(-2, -1).flatten(0, 1) # [batch * head * length, bucket_length * 4 * rounds] sorted_flat_key, flat_key_indices = torch.sort( flat_reordered_key.int(), dim=-1) # [batch * head * length, bucket_length * 4 * rounds] count_shift_keys = torch.ones_like(sorted_flat_key).float() # [batch * head * length, bucket_length * 4 * rounds] for i in range(1, self.rounds): equiv_flat_key = ( sorted_flat_key[..., i:] == sorted_flat_key[..., :-i]).float() count_shift_keys[..., i:] += equiv_flat_key count_shift_keys[..., :-i] += equiv_flat_key count_key_indices = reverse_sort(flat_key_indices, dim=1) # [batch * head * length, bucket_length * 4 * rounds] count_key = torch.gather(count_shift_keys, dim=-1, index=count_key_indices) # [batch * head * length, bucket_length * 4 * rounds] reshaped_count_key = count_key.reshape(-1, length, bucket_length * 4, self.rounds) # [batch * head, length, bucket_length * 4, rounds] scores = scores.flatten(1, 2) # [batch * head, length, bucket_length * 4, rounds] scores = torch.gather(scores, dim=1, index=score_indices) # [batch * head, length, bucket_length * 4, rounds] scores = scores - reshaped_count_key.log().detach() scores = scores.flatten(-2, -1) # [batch * head, length, bucket_length * 4 * rounds] p_attn = F.softmax(scores, dim=-1) # [batch * head, length, bucket_length * 4 * rounds] if self.training: generator = torch.Generator(device=p_attn.get_device()) generator.manual_seed(seed) dropout_mask = torch.bernoulli(p_attn, p=1 - self.dropout, generator=generator) p_attn = dropout_mask * p_attn / (1 - self.dropout) p_attn = p_attn.reshape(-1, length, bucket_length * 4, self.rounds) # [batch * head, length, bucket_length * 4, rounds] flattened_value = value.flatten(0, 1)[..., None].expand( -1, -1, -1, self.rounds) # [batch * head, length, d_k, rounds] reordered_value = torch.gather(flattened_value, dim=1, index=expanded_hash_indices) # [batch * head, length, d_k, rounds] reshaped_value = reordered_value.reshape(-1, self.n_buckets // 2, bucket_length * 2, self.d_k, self.rounds) # [batch * head, n_buckets // 2, bucket_length * 2, d_k, rounds] lookback_value = look_back(reshaped_value) # [batch * head, n_buckets // 2, bucket_length * 4, d_k, rounds] attn_indices = hash_indices[..., None, :].expand(-1, -1, bucket_length * 4, -1) # [batch * head, length, bucket_length * 4, rounds] reordered_p_attn = torch.gather(p_attn, dim=1, index=attn_indices) # [batch * head, length, bucket_length * 4, rounds] new_p_attn = reordered_p_attn.reshape(-1, self.n_buckets // 2, bucket_length * 2, bucket_length * 4, self.rounds) # [batch * head, n_buckets // 2, bucket_length * 2, bucket_length * 4, rounds] attention = torch.einsum('...ijl,...jkl->...ikl', new_p_attn, lookback_value) # [batch * head, n_buckets // 2, bucket_length * 2, d_k, rounds] attention = attention.flatten(1, 2) # [batch * head, length, d_k, rounds] new_indices = original_indices[..., None, :].expand(-1, -1, self.d_k, -1) # [batch * head, length, d_k, rounds] attention = torch.gather(attention, dim=1, index=new_indices).sum(dim=-1) # [batch * head, length, d_k] attention = attention.reshape(-1, self.head, length, self.d_k) # [batch, head, length, d_k] return attention
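# --- Hedged aside (simplified, not the module above): the forward pass applies
# dropout through torch.bernoulli with an explicitly seeded generator instead of
# F.dropout; re-passing the same seed reproduces the identical mask, which is
# what reversible architectures (e.g. Reformer) rely on to recompute activations
# during the backward pass. A minimal CPU sketch of that trick:
import torch

def seeded_dropout(x, p, seed):
    gen = torch.Generator(device=x.device)
    gen.manual_seed(seed)
    # Draw the keep-mask from the seeded generator, then apply inverted scaling.
    mask = torch.bernoulli(torch.full_like(x, 1 - p), generator=gen)
    return mask * x / (1 - p)

x = torch.randn(2, 5)
assert torch.equal(seeded_dropout(x, 0.1, seed=7), seeded_dropout(x, 0.1, seed=7))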
def test_dist_optim(self): # local version module1 = MyModule() module2 = MyModule() params = [module1.get_w(), module2.get_w()] local_optim = optim.SGD(params, lr=0.05) old_w1 = module1.w.clone().detach() old_w2 = module2.w.clone().detach() g_cpu = torch.Generator() g_cpu.manual_seed(0) t1 = torch.rand((3, 3), requires_grad=True, generator=g_cpu) t2 = torch.rand((3, 3), requires_grad=True, generator=g_cpu) output1 = module1.forward(t2) output2 = module2.forward(output1) loss = torch.add(output2, t1).sum() loss.backward() local_optim.step() # distributed version owner1 = "worker%d" % ((self.rank + 1) % self.world_size) owner2 = "worker%d" % ((self.rank + 2) % self.world_size) remote_module1 = rpc.remote(owner1, MyModule) remote_module2 = rpc.remote(owner2, MyModule) remote_param1 = remote_method(MyModule.get_w, remote_module1) remote_param2 = remote_method(MyModule.get_w, remote_module2) old_w1_remote = remote_param1.to_here() # sanity check: local and remote initial weights should match self.assertEqual(old_w1, remote_param1.to_here()) self.assertEqual(old_w2, remote_param2.to_here()) dist_optim = DistributedOptimizer(optim.SGD, [remote_param1, remote_param2], lr=0.05) with dist_autograd.context() as context_id: g_cpu.manual_seed(0) t1 = torch.rand((3, 3), requires_grad=True, generator=g_cpu) t2 = torch.rand((3, 3), requires_grad=True, generator=g_cpu) output1 = rpc_async_method(MyModule.forward, remote_module1, t2) output2 = rpc_async_method(MyModule.forward, remote_module2, output1.wait()) loss = torch.add(output2.wait(), t1) dist_autograd.backward(context_id, [loss.sum()]) dist_optim.step(context_id) new_w1 = rpc_async_method(MyModule.get_w, remote_module1).wait() new_w2 = rpc_async_method(MyModule.get_w, remote_module2).wait() # ensure optimizer changed weights self.assertNotEqual(old_w1, new_w1) self.assertNotEqual(old_w2, new_w2) # ensure local equals remote self.assertEqual(new_w1, module1.get_w()) self.assertEqual(new_w2, module2.get_w())
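# --- Hedged aside (the property the test relies on): reseeding a generator with
# the same value rewinds its stream, so the local and distributed runs consume
# identical random inputs.
import torch

g_cpu = torch.Generator()
g_cpu.manual_seed(0)
t1 = torch.rand((3, 3), generator=g_cpu)
t2 = torch.rand((3, 3), generator=g_cpu)

g_cpu.manual_seed(0)  # rewind the stream
assert torch.equal(t1, torch.rand((3, 3), generator=g_cpu))
assert torch.equal(t2, torch.rand((3, 3), generator=g_cpu))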
def getRandomDataSets():
    # The split lengths belong to random_split, not to ImageFolder
    # (whose third positional argument is target_transform).
    train, val = random_split(
        datasets.ImageFolder("data", data_transforms["train"]),
        [850, 150],
        generator=torch.Generator().manual_seed(42))
    image_datasets['train'] = train
    image_datasets['val'] = val
    image_datasets['example'] = datasets.ImageFolder('example', data_transforms['example'])
    return image_datasets
def _setup_prng():
    """
    Generate shared random seeds to generate pseudo-random sharings of
    zero. For each device, we generate four random seeds:
        "prev"   - shared seed with the previous party
        "next"   - shared seed with the next party
        "local"  - seed known only to the local party (separate from torch's
                   default seed to prevent interference from torch.manual_seed)
        "global" - seed shared by all parties

    The "prev" and "next" random seeds are shared such that each process shares
    one seed with the previous rank process and one with the next rank.
    This allows for the generation of `n` random values, each known to
    exactly two of the `n` parties.

    For arithmetic sharing, one of these parties will add the number while the
    other subtracts it, allowing for the generation of a pseudo-random sharing
    of zero. (This can be done for binary sharing using bitwise-xor rather
    than addition / subtraction.)
    """
    global generators

    # Initialize RNG Generators
    for key in generators.keys():
        generators[key][torch.device("cpu")] = torch.Generator(
            device=torch.device("cpu"))

    if torch.cuda.is_available():
        cuda_device_names = ["cuda"]
        for i in range(torch.cuda.device_count()):
            cuda_device_names.append(f"cuda:{i}")
        cuda_devices = [torch.device(name) for name in cuda_device_names]

        for device in cuda_devices:
            for key in generators.keys():
                generators[key][device] = torch.Generator(device=device)

    # Generate random seeds for Generators
    # NOTE: The chosen seed can be any number, but we choose it as a random
    # 64-bit integer here so other parties cannot guess its value. We use
    # os.urandom(8) here to generate seeds so that forked processes do not
    # generate the same seed.

    # Generate next / prev seeds.
    seed = int.from_bytes(os.urandom(8), "big") - 2**63
    next_seed = torch.tensor(seed)
    prev_seed = torch.tensor([0], dtype=torch.long)  # populated by irecv

    # Send random seed to next party, receive random seed from prev party
    world_size = comm.get().get_world_size()
    rank = comm.get().get_rank()
    if world_size >= 2:  # Guard against segfaults when world_size == 1.
        next_rank = (rank + 1) % world_size
        prev_rank = (next_rank - 2) % world_size

        req0 = comm.get().isend(next_seed, next_rank)
        req1 = comm.get().irecv(prev_seed, src=prev_rank)

        req0.wait()
        req1.wait()
    else:
        prev_seed = next_seed

    prev_seed = prev_seed.item()
    next_seed = next_seed.item()

    # Create local seed - each party has a separate local generator
    local_seed = int.from_bytes(os.urandom(8), "big") - 2**63

    # Create global generator - all parties share one global generator for sync'd rng
    global_seed = int.from_bytes(os.urandom(8), "big") - 2**63
    global_seed = torch.tensor(global_seed)
    global_seed = comm.get().broadcast(global_seed, 0).item()

    # Create one of each seed per party
    # Note: This is configured to coordinate seeds across CUDA devices
    # so that we can run one party per GPU. If we want to support
    # configurations where each party runs on multiple GPUs across machines,
    # we will need to modify this.
    for device in generators["prev"].keys():
        generators["prev"][device].manual_seed(prev_seed)
        generators["next"][device].manual_seed(next_seed)
        generators["local"][device].manual_seed(local_seed)
        generators["global"][device].manual_seed(global_seed)
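# --- Hedged aside (single-process simulation, not CrypTen's API): the
# prev/next seed exchange above enables pseudo-random sharings of zero. Each
# party adds a draw from the generator shared with its next neighbor and
# subtracts a draw from the one shared with its previous neighbor, so the
# shares telescope to zero without any communication.
import torch

n_parties = 3
seeds = [11, 22, 33]  # seeds[i] is shared by party i ("next") and party i+1 ("prev")

def draw(seed):
    g = torch.Generator()
    g.manual_seed(seed)
    return torch.randint(-1000, 1000, (4,), generator=g)

shares = [draw(seeds[i]) - draw(seeds[(i - 1) % n_parties])
          for i in range(n_parties)]
assert torch.equal(sum(shares), torch.zeros(4, dtype=torch.int64))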
def __init__(self, p_stop=0.01, max_length=1000):
    self.p_stop = p_stop
    self.max_length = max_length
    self.generator = torch.Generator()
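# --- Hedged aside (the sample() method below is assumed, not from the source):
# p_stop and max_length suggest this object draws randomly terminated episode
# lengths, stopping each step with probability p_stop via its own generator.
import torch

class EpisodeLength:
    def __init__(self, p_stop=0.01, max_length=1000):
        self.p_stop = p_stop
        self.max_length = max_length
        self.generator = torch.Generator()

    def sample(self):
        # Draw one stop decision per step from the owned generator.
        for t in range(1, self.max_length):
            if torch.rand(1, generator=self.generator).item() < self.p_stop:
                return t
        return self.max_length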
def train_with_val(net, optimizer, criterion, num_epochs, obj_loss_history: List[List], attr_loss_history: List[List], batch_size, dataset, curr_epoch=0, use_tune=False, model_dir: str = None) -> None: """ Train the model with validation set. Parameters: [obj/attr]_loss_history: nested list of length 2. history[0] the training loss history and history[1] the validation loss history. curr_epoch: the epoch number the model already been trained for. model_dir: directory to save model states. """ test_abs = int(len(dataset) * 0.8) train_subset, val_subset = random_split( dataset, [test_abs, len(dataset) - test_abs], generator=torch.Generator().manual_seed(42)) train_dataloader = DataLoader(train_subset, batch_size=batch_size, shuffle=True) val_dataloader = DataLoader(val_subset, batch_size=batch_size, shuffle=True) for epoch in range(curr_epoch, curr_epoch + num_epochs): epoch_steps = 0 obj_running_loss = 0.0 attr_running_loss = 0.0 net.train() # ==== Training ==== for i, batch in tqdm.tqdm(enumerate(train_dataloader), total=len(train_dataloader), disable=use_tune, position=0, leave=True, postfix='Train: epoch %d/%d' % (epoch, curr_epoch + num_epochs)): optimizer.zero_grad() img, attr_id, obj_id = batch[:3] if len(img) == 1: # Batchnorm doesn't accept batch with size 1 continue obj_pred, attr_pred = net(img.to(dev)) obj_loss = criterion(obj_pred, obj_id.to(dev)) attr_loss = criterion(attr_pred, attr_id.to(dev)) loss = obj_loss + attr_loss loss.backward() optimizer.step() obj_running_loss += obj_loss.item() attr_running_loss += attr_loss.item() epoch_steps += 1 if i % 100 == 99: print("[%d, %5d] obj_loss: %.3f, attr_loss: %.3f" % (epoch + 1, i + 1, obj_running_loss / epoch_steps, attr_running_loss / epoch_steps)) obj_loss_history[0].append(obj_running_loss / epoch_steps) attr_loss_history[0].append(attr_running_loss / epoch_steps) running_loss = 0.0 # ==== Validation ==== obj_val_loss = 0.0 attr_val_loss = 0.0 val_steps = 0 net.eval() for i, batch in tqdm.tqdm(enumerate(val_dataloader), total=len(val_dataloader), disable=use_tune, position=0, leave=True): with torch.no_grad(): img, attr_id, obj_id = batch[:3] obj_pred, attr_pred = net(img.to(dev)) obj_loss = criterion(obj_pred, obj_id.to(dev)) attr_loss = criterion(attr_pred, attr_id.to(dev)) obj_val_loss += obj_loss.cpu().numpy() attr_val_loss += attr_loss.cpu().numpy() val_steps += 1 obj_val_loss /= val_steps attr_val_loss /= val_steps print("[%d] obj_val_loss: %.3f, attr_val_loss: %.3f" % (epoch + 1, obj_val_loss, attr_val_loss)) obj_loss_history[1].append(obj_val_loss) attr_loss_history[1].append(attr_val_loss) # ==== Save model, report to tune ==== if use_tune: with tune.checkpoint_dir(epoch) as checkpoint_dir: path = os.path.join(checkpoint_dir, "checkpoint") torch.save( { 'model_state_dict': net.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'obj_loss': obj_loss_history, 'attr_loss': attr_loss_history, }, path) acc = calc_acc(net, val_dataloader, use_tune) tune.report(loss=(obj_val_loss + attr_val_loss), accuracy=acc) print("accuracy: ", acc) else: if model_dir: model_path = os.path.join(model_dir, f"model_{epoch}.pt") torch.save( { 'model_state_dict': net.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'obj_loss': obj_loss_history, 'attr_loss': attr_loss_history, }, model_path) old_model = os.path.join(model_dir, f"model_{epoch-1}.pt") if os.path.isfile(old_model): os.remove(old_model) print("Finished training.")
def train_and_report_stats( self ) -> Generator[Tuple[int, Optional[Stats], Stats, Optional[Stats]], None, None]: holder = self.holder config = self.config iteration_manager = self.iteration_manager total_buckets = holder.nparts_lhs * holder.nparts_rhs # yield stats from checkpoint, to reconstruct # saved part of the learning curve if self.rank == SINGLE_TRAINER: for stats_dict in self.checkpoint_manager.maybe_read_stats(): index: int = stats_dict["index"] stats: Stats = Stats.from_dict(stats_dict["stats"]) eval_stats_before: Optional[Stats] = None if "eval_stats_before" in stats_dict: eval_stats_before = Stats.from_dict( stats_dict["eval_stats_before"]) eval_stats_after: Optional[Stats] = None if "eval_stats_after" in stats_dict: eval_stats_after = Stats.from_dict( stats_dict["eval_stats_after"]) yield (index, eval_stats_before, stats, eval_stats_after) for epoch_idx, edge_path_idx, edge_chunk_idx in iteration_manager: logger.info( f"Starting epoch {epoch_idx + 1} / {iteration_manager.num_epochs}, " f"edge path {edge_path_idx + 1} / {iteration_manager.num_edge_paths}, " f"edge chunk {edge_chunk_idx + 1} / {iteration_manager.num_edge_chunks}" ) edge_storage = EDGE_STORAGES.make_instance( iteration_manager.edge_path) logger.info(f"Edge path: {iteration_manager.edge_path}") self._barrier() dist_logger.info("Lock client new epoch...") self.bucket_scheduler.new_pass( is_first=iteration_manager.iteration_idx == 0) self._barrier() remaining = total_buckets cur_b: Optional[Bucket] = None cur_stats: Optional[BucketStats] = None while remaining > 0: old_b: Optional[Bucket] = cur_b old_stats: Optional[BucketStats] = cur_stats cur_b, remaining = self.bucket_scheduler.acquire_bucket() logger.info(f"still in queue: {remaining}") if cur_b is None: cur_stats = None if old_b is not None: # if you couldn't get a new pair, release the lock # to prevent a deadlock! tic = time.perf_counter() release_bytes = self._swap_partitioned_embeddings( old_b, None, old_stats) release_time = time.perf_counter() - tic logger.info( f"Swapping old embeddings to release lock. io: {release_time:.2f} s for {release_bytes:,} bytes " f"( {release_bytes / release_time / 1e6:.2f} MB/sec )" ) time.sleep(1) # don't hammer td continue tic = time.perf_counter() self.cur_b = cur_b bucket_logger = BucketLogger(logger, bucket=cur_b) self.bucket_logger = bucket_logger io_bytes = self._swap_partitioned_embeddings( old_b, cur_b, old_stats) self.model.set_all_embeddings(holder, cur_b) current_index = (iteration_manager.iteration_idx + 1) * total_buckets - remaining bucket_logger.debug("Loading edges") edges = edge_storage.load_chunk_of_edges( cur_b.lhs, cur_b.rhs, edge_chunk_idx, iteration_manager.num_edge_chunks, shared=True, ) num_edges = len(edges) # this might be off in the case of tensorlist or extra edge fields io_bytes += edges.lhs.tensor.numel( ) * edges.lhs.tensor.element_size() io_bytes += edges.rhs.tensor.numel( ) * edges.rhs.tensor.element_size() io_bytes += edges.rel.numel() * edges.rel.element_size() io_time = time.perf_counter() - tic tic = time.perf_counter() bucket_logger.debug("Shuffling edges") # Fix a seed to get the same permutation every time; have it # depend on all and only what affects the set of edges. # Note: for the sake of efficiency, we sample eval edge idxs # from the edge set *with replacement*, meaning that there may # be duplicates of the same edge in the eval set. When we swap # edges into the eval set, if there are duplicates then all # but one will be clobbered. 
# These collisions are unlikely if eval_fraction is small.
#
# Importantly, this eval sampling strategy is theoretically sound:
# * Training and eval sets are (exactly) disjoint
# * Eval set may have (rare) duplicates, but they are uniformly
#   sampled so it's still an unbiased estimator of the
#   out-of-sample statistics
num_eval_edges = int(num_edges * config.eval_fraction)
num_train_edges = num_edges - num_eval_edges
if num_eval_edges > 0:
    g = torch.Generator()
    g.manual_seed(
        hash((edge_path_idx, edge_chunk_idx, cur_b.lhs, cur_b.rhs)))
    eval_edge_idxs = torch.randint(num_edges, (num_eval_edges,),
                                   dtype=torch.long, generator=g)
else:
    eval_edge_idxs = None

# HOGWILD evaluation before training
eval_stats_before = self._coordinate_eval(edges, eval_edge_idxs)
if eval_stats_before is not None:
    bucket_logger.info(f"Stats before training: {eval_stats_before}")
eval_time = time.perf_counter() - tic
tic = time.perf_counter()

# HOGWILD training
bucket_logger.debug("Waiting for workers to perform training")
stats = self._coordinate_train(edges, eval_edge_idxs, epoch_idx)
train_time = time.perf_counter() - tic
tic = time.perf_counter()

# HOGWILD evaluation after training
eval_stats_after = self._coordinate_eval(edges, eval_edge_idxs)
if eval_stats_after is not None:
    bucket_logger.info(f"Stats after training: {eval_stats_after}")
eval_time += time.perf_counter() - tic

bucket_logger.info(
    f"bucket {total_buckets - remaining} / {total_buckets} : "
    f"Trained {num_train_edges} edges in {train_time:.2f} s "
    f"( {num_train_edges / train_time / 1e6:.2g} M/sec ); "
    f"Eval 2*{num_eval_edges} edges in {eval_time:.2f} s "
    f"( {2 * num_eval_edges / eval_time / 1e6:.2g} M/sec ); "
    f"io: {io_time:.2f} s for {io_bytes:,} bytes ( {io_bytes / io_time / 1e6:.2f} MB/sec )"
)
bucket_logger.info(f"{stats}")

self.model.clear_all_embeddings()

yield current_index, eval_stats_before, stats, eval_stats_after

cur_stats = BucketStats(
    lhs_partition=cur_b.lhs,
    rhs_partition=cur_b.rhs,
    index=current_index,
    train=stats,
    eval_before=eval_stats_before,
    eval_after=eval_stats_after,
)

# release the final bucket
self._swap_partitioned_embeddings(cur_b, None, cur_stats)

# Distributed Processing: all machines can leave the barrier now.
self._barrier()

self._maybe_write_checkpoint(epoch_idx, edge_path_idx, edge_chunk_idx)

# now we're sure that all partition files exist,
# so be strict about loading them
self.strict = True
def _sort_shard_and_shuffle_dataset(self): # This method returns a list of dataset sample indices after # the dataset has been sorted, sharded and shuffled. # The sorting of the dataset happens based on the group_size and complexities # of each sample. # Sharding happens across the number of workers. # Shuffling is done either before sharding on the group indices (if group_size is provided) # or on the dataset sample indices if the group_size is not provided. def sort_in_groups(sample_complexities, group_size): """Sort the dataset samples indices inside each group of size group_size.""" # If the group_size is None, the entire dataset is considered as a single group if group_size is None: group_size = len(sample_complexities) # Sort the dataset samples inside each group of the dataset based on sample complexity. for group_begin_index in range(0, len(sample_complexities), group_size): group_end_index = min(group_begin_index + group_size, len(sample_complexities)) sorted_indices = group_begin_index + np.argsort( sample_complexities[group_begin_index:group_end_index, 1]) sample_complexities[ group_begin_index: group_end_index, :] = sample_complexities[sorted_indices] return sample_complexities # Get the samples and their complexities from the complexity_fn if not self.sample_complexities: self.sample_complexities = np.empty((len(self.dataset), 2), dtype=np.int64) for sample_index in range(len(self.dataset)): self.sample_complexities[sample_index][0] = sample_index self.sample_complexities[sample_index][1] = self.complexity_fn( self.dataset[sample_index]) if self.random_number is None: max_complexity = max(self.sample_complexities, key=lambda t: t[1])[1] min_complexity = min(self.sample_complexities, key=lambda t: t[1])[1] self.random_number = int((max_complexity - min_complexity) * self.random_level + 1) sample_complexities = self.sample_complexities.copy() # Control the degree of load balancing by modifying the complexities of # all samples using the random_number. g = torch.Generator() g = g.manual_seed(self.seed + self.epoch) if self.random_number > 1: complexity_random_ints = torch.randint( self.random_number, (len(sample_complexities), ), generator=g).tolist() for index, random_int in enumerate(complexity_random_ints): sample_complexities[index][1] += random_int # Sort the data based on the computed complexities and group sizes. # Sort only once if random_number <= 1 else sort everytime if self.ordered_sample_complexities is None or self.random_number > 1: self.ordered_sample_complexities = sort_in_groups( sample_complexities, self.group_size) ordered_sample_complexities = self.ordered_sample_complexities # If group_size is not None, shuffle the index of each group instead # of shuffling the data indices. if self.shuffle and self.group_size is not None: num_groups = (len(self.sample_complexities) + self.group_size - 1) // self.group_size group_order = torch.randperm(num_groups, generator=g).tolist() end = 0 sample_complexities_copy = ordered_sample_complexities.copy() for group_index in group_order: original_list_begin_index = self.group_size * group_index original_list_end_index = min( original_list_begin_index + self.group_size, len(sample_complexities)) begin = end end = begin + (original_list_end_index - original_list_begin_index) sample_complexities_copy[begin:end, :] = sample_complexities[ original_list_begin_index:original_list_end_index, :] ordered_sample_complexities = sample_complexities_copy # Shard the data across the different workers. 
index_chunks = list( _shard_wrapped_indices_across_workers( [ index_complexity_tuple[0] for index_complexity_tuple in ordered_sample_complexities ], self.world_size, self.num_samples, )) # Shuffle the sharded data indices deterministically based on epoch and seed. chunk_indices = list(range(len(index_chunks))) if self.shuffle and self.group_size is None: chunk_indices = torch.randperm(len(index_chunks), generator=g).tolist() if not self.drop_last: # Add extra samples to make it evenly divisible padding_size = self.num_samples - len(chunk_indices) if padding_size <= len(chunk_indices): chunk_indices += chunk_indices[:padding_size] else: chunk_indices += (chunk_indices * math.ceil( padding_size / len(chunk_indices)))[:padding_size] else: # Remove tail of data to make it evenly divisible. chunk_indices = chunk_indices[:self.num_samples] assert len(chunk_indices) == self.num_samples return index_chunks, chunk_indices
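# The helper _shard_wrapped_indices_across_workers is defined elsewhere; below
# is a hypothetical stand-in sketching what such a helper might do, assuming it
# yields one chunk of world_size indices per sample slot and wraps around the
# index list so every rank ends up with exactly num_samples indices.
from typing import Iterator, List, Sequence

def _shard_wrapped_indices_sketch(
        indices: Sequence[int], world_size: int, num_samples: int) -> Iterator[List[int]]:
    for slot in range(num_samples):
        base = slot * world_size
        # Wrap with modulo so short index lists still fill every chunk.
        yield [indices[(base + rank) % len(indices)] for rank in range(world_size)]

chunks = list(_shard_wrapped_indices_sketch(list(range(10)), world_size=4, num_samples=3))
assert len(chunks) == 3 and all(len(chunk) == 4 for chunk in chunks)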
def generate_images( ctx: click.Context, network_pkl: str, seeds: Optional[List[int]], truncation_psi: object, noise_mode: str, outdir: str, class_idx: Optional[int], projected_w: Optional[str] ): """Generate images using pretrained network pickle. Examples: \b # Generate curated MetFaces images without truncation (Fig.10 left) python generate.py --outdir=out --trunc=1 --seeds=85,265,297,849 \\ --network=https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metfaces.pkl \b # Generate uncurated MetFaces images with truncation (Fig.12 upper left) python generate.py --outdir=out --trunc=0.7 --seeds=600-605 \\ --network=https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metfaces.pkl \b # Generate class conditional CIFAR-10 images (Fig.17 left, Car) python generate.py --outdir=out --seeds=0-35 --class=1 \\ --network=https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/cifar10.pkl \b # Render an image from projected W python generate.py --outdir=out --projected_w=projected_w.npz \\ --network=https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metfaces.pkl """ print('Loading networks from "%s"...' % network_pkl) if network_pkl == 'latest': files = glob.glob("training-runs/*/*.pkl") files.sort(key=os.path.getmtime) network_pkl = files[-1] device = torch.device('cuda') with dnnlib.util.open_url(network_pkl) as f: G = legacy.load_network_pkl(f)['G_ema'].to(device) # type: ignore os.makedirs(outdir, exist_ok=True) # Synthesize the result of a W projection. if projected_w is not None: if seeds is not None: print('warn: --seeds is ignored when using --projected-w') print(f'Generating images from projected W "{projected_w}"') ws = np.load(projected_w)['w'] ws = torch.tensor(ws, device=device) # pylint: disable=not-callable assert ws.shape[1:] == (G.num_ws, G.w_dim) for idx, w in enumerate(ws): img = G.synthesis(w.unsqueeze(0), noise_mode=noise_mode) img = (img.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8) PIL.Image.fromarray(img[0].cpu().numpy(), 'RGB').save(f'{outdir}/proj{idx:02d}.png') return if seeds is None: ctx.fail('--seeds option is required when not using --projected-w') # Labels. label = torch.zeros([1, G.c_dim], device=device) if G.c_dim != 0: if class_idx is None: ctx.fail('Must specify class label with --class when using a conditional network') label[:, class_idx] = 1 else: if class_idx is not None: print('warn: --class=lbl ignored when running on an unconditional network') # Generate images. for seed_idx, seed in enumerate(seeds): print('Generating image for seed %d (%d/%d) ...' % (seed, seed_idx, len(seeds))) z = torch.from_numpy(np.random.RandomState(seed).randn(1, G.z_dim)).to(device) rand_gen = None if noise_mode == 'generator': rand_gen = torch.Generator(device=device) rand_gen.manual_seed(seed) if truncation_psi == 'seed': tpsi = (np.random.RandomState(seed).randint(0, 100) / 100) + 0.5 else: tpsi = truncation_psi img = G(z, label, truncation_psi=tpsi, noise_mode=noise_mode, rand_gen=rand_gen) img = (img.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8)[0].cpu().numpy() if img.shape[-1] == 1: img = np.repeat(img, 3, -1) PIL.Image.fromarray(img, 'RGB').save(f'{outdir}/seed{seed:04d}.png')
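# The click option feeding `seeds` is defined elsewhere in the script; the
# examples above accept both comma lists ('85,265,297,849') and ranges
# ('600-605'). A hypothetical parser with that behaviour could look like this
# (sketch only, not the script's actual option callback):
import re
from typing import List, Optional

def parse_num_range_sketch(s: Optional[str]) -> Optional[List[int]]:
    """Parse either a comma-separated list of numbers or a 'lo-hi' range."""
    if s is None:
        return None
    match = re.fullmatch(r'(\d+)-(\d+)', s)
    if match:
        return list(range(int(match.group(1)), int(match.group(2)) + 1))
    return [int(x) for x in s.split(',')]

assert parse_num_range_sketch('600-605') == [600, 601, 602, 603, 604, 605]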
def _setup_przs(device=None): """ Generate shared random seeds to generate pseudo-random sharings of zero. The random seeds are shared such that each process shares one seed with the previous rank process and one with the next rank. This allows for the generation of `n` random values, each known to exactly two of the `n` parties. For arithmetic sharing, one of these parties will add the number while the other subtracts it, allowing for the generation of a pseudo-random sharing of zero. (This can be done for binary sharing using bitwise-xor rather than addition / subtraction) """ # Initialize RNG Generators comm.get().g0 = torch.Generator() comm.get().g1 = torch.Generator() # Only set up CUDA generators when a GPU is actually available; resolving # the device first would otherwise assert on CPU-only machines. if torch.cuda.is_available(): device = "cuda" if device is None else device device = torch.device(device) assert device.type == "cuda", "Must be a GPU device" comm.get().g0_cuda = torch.Generator(device=device) comm.get().g1_cuda = torch.Generator(device=device) # Generate random seeds for Generators # NOTE: Chosen seed can be any number, but we choose a random 64-bit # integer here so other parties cannot guess its value. # We sometimes get here from a forked process, which causes all parties # to have the same RNG state. Reset the seed to make sure RNG streams # are different in all the parties. We use numpy's random here since # setting its seed to None will produce different seeds even from # forked processes. import numpy numpy.random.seed(seed=None) next_seed = torch.tensor(numpy.random.randint(-(2**63), 2**63 - 1, (1, ))) prev_seed = torch.LongTensor([0]) # placeholder # Send random seed to next party, receive random seed from prev party world_size = comm.get().get_world_size() rank = comm.get().get_rank() if world_size >= 2: # Otherwise sending seeds will segfault. next_rank = (rank + 1) % world_size prev_rank = (rank - 1) % world_size req0 = comm.get().isend(tensor=next_seed, dst=next_rank) req1 = comm.get().irecv(tensor=prev_seed, src=prev_rank) req0.wait() req1.wait() else: prev_seed = next_seed # Seed Generators comm.get().g0.manual_seed(next_seed.item()) comm.get().g1.manual_seed(prev_seed.item()) # Create global generator global_seed = torch.tensor(numpy.random.randint(-(2**63), 2**63 - 1, (1, ))) global_seed = comm.get().broadcast(global_seed, 0) comm.get().global_generator = torch.Generator() comm.get().global_generator.manual_seed(global_seed.item())
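# A toy, single-process illustration of the PRZS property documented above
# (sketch only; the real protocol exchanges the seeds over comm as shown).
# Assume a ring of world_size parties where party i shares seeds[i] with party
# i + 1. Each seed is consumed once with '+' and once with '-', so the shares
# sum to zero. Seeds and shapes are illustrative.
import torch

world_size = 3
seeds = [11, 22, 33]

def przs_share(rank: int, shape=(4,)) -> torch.Tensor:
    g_next = torch.Generator().manual_seed(seeds[rank])                     # shared with rank + 1
    g_prev = torch.Generator().manual_seed(seeds[(rank - 1) % world_size])  # shared with rank - 1
    return torch.rand(shape, generator=g_next) - torch.rand(shape, generator=g_prev)

total = sum(przs_share(r) for r in range(world_size))
assert torch.allclose(total, torch.zeros(4), atol=1e-6)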
def __init__(self, data_source): self.data_source = data_source self.gen = torch.Generator().manual_seed(0)
p_drop = 0.5 learning_rate = 5e-4 classifiers = [LatticeClassifier, ConvClassifier] #, HybridClassifier] classifier_names = ["lattice", "conv"] #,"hybrid"] for (Classifier, name) in zip(classifiers, classifier_names): train_accuracy = torch.zeros(n_epochs, n_trials) test_accuracy = torch.zeros(n_epochs, n_trials) train_loss = torch.zeros(n_epochs, n_trials) #print("testing model '{:s}'".format(name)) for trial in range(n_trials): trial_start = time.time() data = [[X[index, :, :, :], Y[index]] for index in range(X.shape[0])] training_data, testing_data = random_split( data, [len(data) - len(data) // 10, len(data) // 10], generator=torch.Generator().manual_seed(42 + trial)) trainloader = DataLoader(training_data, batch_size=128, shuffle=True, pin_memory=True) testloader = DataLoader(testing_data, batch_size=128, shuffle=False, pin_memory=True) print("{:s} trial {:d}".format(name, trial + 1)) model = Classifier(feature_dim, n_features, n_classes, alpha=alpha, p_drop=p_drop) model = model.to(device)
def main_worker(args): global start_epoch, best_recall5 init_dist(args.launcher, args) synchronize() if args.seed is not None: random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) if args.deterministic: cudnn.deterministic = True cudnn.benchmark = False print("Use GPU: {} for training, rank no.{} of world_size {}".format( args.gpu, args.rank, args.world_size)) if (args.rank == 0): sys.stdout = Logger(osp.join(args.logs_dir, 'log.txt')) print("==========\nArgs:{}\n==========".format(args)) # Create data loaders iters = args.iters if (args.iters > 0) else None dataset, train_loader, val_loader, test_loader, sampler, train_extract_loader = get_data( args, iters) # Create model model = get_model(args) # Load from checkpoint if args.resume: checkpoint = load_checkpoint(args.resume) copy_state_dict(checkpoint['state_dict'], model) start_epoch = checkpoint['epoch'] + 1 best_recall5 = checkpoint['best_recall5'] if (args.rank == 0): print("=> Start epoch {} best recall5 {:.1%}".format( start_epoch, best_recall5)) # Evaluator evaluator = Evaluator(model) if (args.rank == 0): print("Test the initial model:") recalls = evaluator.evaluate( val_loader, sorted(list(set(dataset.q_val) | set(dataset.db_val))), dataset.q_val, dataset.db_val, dataset.val_pos, vlad=args.vlad, gpu=args.gpu, sync_gather=args.sync_gather) # Optimizer optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.step_size, gamma=0.5) # Trainer trainer = Trainer(model, margin=args.margin**0.5, gpu=args.gpu) if ((args.cache_size < args.tuple_size) or (args.cache_size > len(dataset.q_train))): args.cache_size = len(dataset.q_train) # Start training for epoch in range(start_epoch, args.epochs): sampler.set_epoch(args.seed + epoch) args.cache_size = args.cache_size * (2**(epoch // args.step_size)) g = torch.Generator() g.manual_seed(args.seed + epoch) subset_indices = torch.randperm(len( dataset.q_train), generator=g).long().split(args.cache_size) for subid, subset in enumerate(subset_indices): update_sampler(sampler, model, train_extract_loader, dataset.q_train, dataset.db_train, subset.tolist(), vlad=args.vlad, gpu=args.gpu, sync_gather=args.sync_gather) synchronize() trainer.train(epoch, subid, train_loader, optimizer, train_iters=len(train_loader), print_freq=args.print_freq, vlad=args.vlad, loss_type=args.loss_type) synchronize() if ((epoch + 1) % args.eval_step == 0 or (epoch == args.epochs - 1)): recalls = evaluator.evaluate( val_loader, sorted(list(set(dataset.q_val) | set(dataset.db_val))), dataset.q_val, dataset.db_val, dataset.val_pos, vlad=args.vlad, gpu=args.gpu, sync_gather=args.sync_gather) is_best = recalls[1] > best_recall5 best_recall5 = max(recalls[1], best_recall5) if (args.rank == 0): save_checkpoint( { 'state_dict': model.state_dict(), 'epoch': epoch, 'best_recall5': best_recall5, }, is_best, fpath=osp.join(args.logs_dir, 'checkpoint' + str(epoch) + '.pth')) print( '\n * Finished epoch {:3d} recall@1: {:5.1%} recall@5: {:5.1%} recall@10: {:5.1%} best@5: {:5.1%}{}\n' .format(epoch, recalls[0], recalls[1], recalls[2], best_recall5, ' *' if is_best else '')) lr_scheduler.step() synchronize() # final inference if (args.rank == 0): print("Performing PCA reduction on the best model:") model.load_state_dict( load_checkpoint(osp.join(args.logs_dir, 'model_best.pth'))['state_dict']) pca_parameters_path = 
osp.join(args.logs_dir, 'pca_params_model_best.h5') pca = PCA(args.features, (not args.nowhiten), pca_parameters_path) dict_f = extract_features( model, train_extract_loader, sorted(list(set(dataset.q_train) | set(dataset.db_train))), vlad=args.vlad, gpu=args.gpu, sync_gather=args.sync_gather) features = list(dict_f.values()) if (len(features) > 10000): features = random.sample(features, 10000) features = torch.stack(features) if (args.rank == 0): pca.train(features) synchronize() del features if (args.rank == 0): print("Testing on Pitts30k-test:") evaluator.evaluate(test_loader, sorted(list(set(dataset.q_test) | set(dataset.db_test))), dataset.q_test, dataset.db_test, dataset.test_pos, vlad=args.vlad, pca=pca, gpu=args.gpu, sync_gather=args.sync_gather) synchronize() return
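# A sketch of the epoch-seeded subset schedule used in the training loop above:
# seeding a generator with seed + epoch makes the cache subsets reproducible per
# epoch, and .split() chunks the permutation into cache-sized pieces (the last
# piece may be smaller). All numbers here are illustrative.
import torch

seed, epoch, cache_size, n_queries = 7, 2, 32, 100
g = torch.Generator()
g.manual_seed(seed + epoch)
subset_indices = torch.randperm(n_queries, generator=g).long().split(cache_size)
assert sum(len(s) for s in subset_indices) == n_queries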
def train_and_report_stats( config: ConfigSchema, model: Optional[MultiRelationEmbedder] = None, trainer: Optional[AbstractBatchProcessor] = None, evaluator: Optional[AbstractBatchProcessor] = None, rank: Rank = RANK_ZERO, subprocess_init: Optional[Callable[[], None]] = None, ) -> Generator[Tuple[int, Optional[Stats], Stats, Optional[Stats]], None, None]: """Each epoch/pass, for each partition pair, loads in embeddings and edgelist from disk, runs HOGWILD training on them, and writes partitions back to disk. """ tag_logs_with_process_name(f"Trainer-{rank}") if config.verbose > 0: import pprint pprint.PrettyPrinter().pprint(config.to_dict()) logger.info("Loading entity counts...") entity_counts: Dict[str, List[int]] = {} for entity, econf in config.entities.items(): entity_counts[entity] = [] for part in range(econf.num_partitions): with open(os.path.join( config.entity_path, "entity_count_%s_%d.txt" % (entity, part) ), "rt") as tf: entity_counts[entity].append(int(tf.read().strip())) # Figure out how many lhs and rhs partitions we need nparts_lhs, lhs_partitioned_types = get_partitioned_types(config, Side.LHS) nparts_rhs, rhs_partitioned_types = get_partitioned_types(config, Side.RHS) logger.debug( f"nparts {nparts_lhs} {nparts_rhs} " f"types {lhs_partitioned_types} {rhs_partitioned_types}") total_buckets = nparts_lhs * nparts_rhs sync: AbstractSynchronizer bucket_scheduler: AbstractBucketScheduler parameter_sharer: Optional[ParameterSharer] partition_client: Optional[PartitionClient] if config.num_machines > 1: if not 0 <= rank < config.num_machines: raise RuntimeError("Invalid rank for trainer") if not td.is_available(): raise RuntimeError("The installed PyTorch version doesn't provide " "distributed training capabilities.") ranks = ProcessRanks.from_num_invocations( config.num_machines, config.num_partition_servers) if rank == RANK_ZERO: logger.info("Setup lock server...") start_server( LockServer( num_clients=len(ranks.trainers), nparts_lhs=nparts_lhs, nparts_rhs=nparts_rhs, lock_lhs=len(lhs_partitioned_types) > 0, lock_rhs=len(rhs_partitioned_types) > 0, init_tree=config.distributed_tree_init_order, ), process_name="LockServer", init_method=config.distributed_init_method, world_size=ranks.world_size, server_rank=ranks.lock_server, groups=[ranks.trainers], subprocess_init=subprocess_init, ) bucket_scheduler = DistributedBucketScheduler( server_rank=ranks.lock_server, client_rank=ranks.trainers[rank], ) logger.info("Setup param server...") start_server( ParameterServer(num_clients=len(ranks.trainers)), process_name=f"ParamS-{rank}", init_method=config.distributed_init_method, world_size=ranks.world_size, server_rank=ranks.parameter_servers[rank], groups=[ranks.trainers], subprocess_init=subprocess_init, ) parameter_sharer = ParameterSharer( process_name=f"ParamC-{rank}", client_rank=ranks.parameter_clients[rank], all_server_ranks=ranks.parameter_servers, init_method=config.distributed_init_method, world_size=ranks.world_size, groups=[ranks.trainers], subprocess_init=subprocess_init, ) if config.num_partition_servers == -1: start_server( ParameterServer(num_clients=len(ranks.trainers)), process_name=f"PartS-{rank}", init_method=config.distributed_init_method, world_size=ranks.world_size, server_rank=ranks.partition_servers[rank], groups=[ranks.trainers], subprocess_init=subprocess_init, ) if len(ranks.partition_servers) > 0: partition_client = PartitionClient(ranks.partition_servers) else: partition_client = None groups = init_process_group( rank=ranks.trainers[rank], 
world_size=ranks.world_size, init_method=config.distributed_init_method, groups=[ranks.trainers], ) trainer_group, = groups sync = DistributedSynchronizer(trainer_group) else: sync = DummySynchronizer() bucket_scheduler = SingleMachineBucketScheduler( nparts_lhs, nparts_rhs, config.bucket_order) parameter_sharer = None partition_client = None hide_distributed_logging() # fork early for HOGWILD threads logger.info("Creating workers...") num_workers = get_num_workers(config.workers) pool = create_pool( num_workers, subprocess_name=f"TWorker-{rank}", subprocess_init=subprocess_init, ) def make_optimizer(params: Iterable[torch.nn.Parameter], is_emb: bool) -> Optimizer: params = list(params) if len(params) == 0: optimizer = DummyOptimizer() elif is_emb: optimizer = RowAdagrad(params, lr=config.lr) else: if config.relation_lr is not None: lr = config.relation_lr else: lr = config.lr optimizer = Adagrad(params, lr=lr) optimizer.share_memory() return optimizer # background_io is only supported in single-machine mode background_io = config.background_io and config.num_machines == 1 checkpoint_manager = CheckpointManager( config.checkpoint_path, background=background_io, rank=rank, num_machines=config.num_machines, partition_client=partition_client, subprocess_name=f"BackgRW-{rank}", subprocess_init=subprocess_init, ) checkpoint_manager.register_metadata_provider(ConfigMetadataProvider(config)) checkpoint_manager.write_config(config) iteration_manager = IterationManager( config.num_epochs, config.edge_paths, config.num_edge_chunks, iteration_idx=checkpoint_manager.checkpoint_version) checkpoint_manager.register_metadata_provider(iteration_manager) if config.init_path is not None: loadpath_manager = CheckpointManager(config.init_path) else: loadpath_manager = None def load_embeddings( entity: EntityName, part: Partition, strict: bool = False, force_dirty: bool = False, ) -> Tuple[torch.nn.Parameter, Optional[OptimizerStateDict]]: if strict: embs, optim_state = checkpoint_manager.read(entity, part, force_dirty=force_dirty) else: # Strict is only false during the first iteration, because in that # case the checkpoint may not contain any data (unless a previous # run was resumed) so we fall back on initial values. 
embs, optim_state = checkpoint_manager.maybe_read(entity, part, force_dirty=force_dirty) if embs is None and loadpath_manager is not None: embs, optim_state = loadpath_manager.maybe_read(entity, part) if embs is None: embs, optim_state = init_embs(entity, entity_counts[entity][part], config.dimension, config.init_scale) assert embs.is_shared() return torch.nn.Parameter(embs), optim_state logger.info("Initializing global model...") if model is None: model = make_model(config) model.share_memory() if trainer is None: trainer = Trainer( global_optimizer=make_optimizer(model.parameters(), False), loss_fn=config.loss_fn, margin=config.margin, relations=config.relations, ) if evaluator is None: evaluator = TrainingRankingEvaluator( override_num_batch_negs=config.eval_num_batch_negs, override_num_uniform_negs=config.eval_num_uniform_negs, ) eval_batch_size = round_up_to_nearest_multiple(config.batch_size, config.eval_num_batch_negs) state_dict, optim_state = checkpoint_manager.maybe_read_model() if state_dict is None and loadpath_manager is not None: state_dict, optim_state = loadpath_manager.maybe_read_model() if state_dict is not None: model.load_state_dict(state_dict, strict=False) if optim_state is not None: trainer.global_optimizer.load_state_dict(optim_state) logger.debug("Loading unpartitioned entities...") for entity, econfig in config.entities.items(): if econfig.num_partitions == 1: embs, optim_state = load_embeddings(entity, Partition(0)) model.set_embeddings(entity, embs, Side.LHS) model.set_embeddings(entity, embs, Side.RHS) optimizer = make_optimizer([embs], True) if optim_state is not None: optimizer.load_state_dict(optim_state) trainer.entity_optimizers[(entity, Partition(0))] = optimizer # start communicating shared parameters with the parameter server if parameter_sharer is not None: parameter_sharer.share_model_params(model) strict = False def swap_partitioned_embeddings( old_b: Optional[Bucket], new_b: Optional[Bucket], ): # 0. given the old and new buckets, construct data structures to keep # track of old and new embedding (entity, part) tuples io_bytes = 0 logger.info(f"Swapping partitioned embeddings {old_b} {new_b}") types = ([(e, Side.LHS) for e in lhs_partitioned_types] + [(e, Side.RHS) for e in rhs_partitioned_types]) old_parts = {(e, old_b.get_partition(side)): side for e, side in types if old_b is not None} new_parts = {(e, new_b.get_partition(side)): side for e, side in types if new_b is not None} to_checkpoint = set(old_parts) - set(new_parts) preserved = set(old_parts) & set(new_parts) # 1. checkpoint embeddings that will not be used in the next pair # if old_b is not None: # there are previous embeddings to checkpoint logger.info("Writing partitioned embeddings") for entity, part in to_checkpoint: side = old_parts[(entity, part)] side_name = side.pick("lhs", "rhs") logger.debug(f"Checkpointing ({entity} {part} {side_name})") embs = model.get_embeddings(entity, side) optim_key = (entity, part) optim_state = OptimizerStateDict(trainer.entity_optimizers[optim_key].state_dict()) io_bytes += embs.numel() * embs.element_size() # ignore optim state checkpoint_manager.write(entity, part, embs.detach(), optim_state) if optim_key in trainer.entity_optimizers: del trainer.entity_optimizers[optim_key] # these variables are holding large objects; let them be freed del embs del optim_state bucket_scheduler.release_bucket(old_b) # 2. 
copy old embeddings that will be used in the next pair # into a temporary dictionary # tmp_emb = {x: model.get_embeddings(x[0], old_parts[x]) for x in preserved} for entity, _ in types: model.clear_embeddings(entity, Side.LHS) model.clear_embeddings(entity, Side.RHS) if new_b is None: # there are no new embeddings to load return io_bytes bucket_logger = BucketLogger(logger, bucket=new_b) # 3. load new embeddings into the model/optimizer, either from disk # or the temporary dictionary # bucket_logger.info("Loading entities") for entity, side in types: part = new_b.get_partition(side) part_key = (entity, part) if part_key in tmp_emb: bucket_logger.debug(f"Loading ({entity}, {part}) from preserved") embs, optim_state = tmp_emb[part_key], None else: bucket_logger.debug(f"Loading ({entity}, {part})") force_dirty = bucket_scheduler.check_and_set_dirty(entity, part) embs, optim_state = load_embeddings( entity, part, strict=strict, force_dirty=force_dirty) io_bytes += embs.numel() * embs.element_size() # ignore optim state model.set_embeddings(entity, embs, side) tmp_emb[part_key] = embs optim_key = (entity, part) if optim_key not in trainer.entity_optimizers: bucket_logger.debug(f"Resetting optimizer {optim_key}") optimizer = make_optimizer([embs], True) if optim_state is not None: bucket_logger.debug("Setting optim state") optimizer.load_state_dict(optim_state) trainer.entity_optimizers[optim_key] = optimizer return io_bytes # Start of the main training loop. for epoch_idx, edge_path_idx, edge_chunk_idx in iteration_manager: logger.info( f"Starting epoch {epoch_idx + 1} / {iteration_manager.num_epochs}, " f"edge path {edge_path_idx + 1} / {iteration_manager.num_edge_paths}, " f"edge chunk {edge_chunk_idx + 1} / {iteration_manager.num_edge_chunks}") edge_reader = EdgeReader(iteration_manager.edge_path) logger.info(f"Edge path: {iteration_manager.edge_path}") sync.barrier() dist_logger.info("Lock client new epoch...") bucket_scheduler.new_pass(is_first=iteration_manager.iteration_idx == 0) sync.barrier() remaining = total_buckets cur_b = None while remaining > 0: old_b = cur_b io_time = 0. io_bytes = 0 cur_b, remaining = bucket_scheduler.acquire_bucket() logger.info(f"still in queue: {remaining}") if cur_b is None: if old_b is not None: # if you couldn't get a new pair, release the lock # to prevent a deadlock! tic = time.time() io_bytes += swap_partitioned_embeddings(old_b, None) io_time += time.time() - tic time.sleep(1) # don't hammer td continue bucket_logger = BucketLogger(logger, bucket=cur_b) tic = time.time() io_bytes += swap_partitioned_embeddings(old_b, cur_b) current_index = \ (iteration_manager.iteration_idx + 1) * total_buckets - remaining next_b = bucket_scheduler.peek() if next_b is not None and background_io: # Ensure the previous bucket finished writing to disk. 
checkpoint_manager.wait_for_marker(current_index - 1) bucket_logger.debug("Prefetching") for entity in lhs_partitioned_types: checkpoint_manager.prefetch(entity, next_b.lhs) for entity in rhs_partitioned_types: checkpoint_manager.prefetch(entity, next_b.rhs) checkpoint_manager.record_marker(current_index) bucket_logger.debug("Loading edges") edges = edge_reader.read( cur_b.lhs, cur_b.rhs, edge_chunk_idx, config.num_edge_chunks) num_edges = len(edges) # this might be off in the case of tensorlist or extra edge fields io_bytes += edges.lhs.tensor.numel() * edges.lhs.tensor.element_size() io_bytes += edges.rhs.tensor.numel() * edges.rhs.tensor.element_size() io_bytes += edges.rel.numel() * edges.rel.element_size() bucket_logger.debug("Shuffling edges") # Fix a seed to get the same permutation every time; have it # depend on all and only what affects the set of edges. g = torch.Generator() g.manual_seed(hash((edge_path_idx, edge_chunk_idx, cur_b.lhs, cur_b.rhs))) num_eval_edges = int(num_edges * config.eval_fraction) if num_eval_edges > 0: edge_perm = torch.randperm(num_edges, generator=g) eval_edge_perm = edge_perm[-num_eval_edges:] num_edges -= num_eval_edges edge_perm = edge_perm[torch.randperm(num_edges)] else: edge_perm = torch.randperm(num_edges) # HOGWILD evaluation before training eval_stats_before: Optional[Stats] = None if num_eval_edges > 0: bucket_logger.debug("Waiting for workers to perform evaluation") future_all_eval_stats_before = pool.map_async(call, [ partial( process_in_batches, batch_size=eval_batch_size, model=model, batch_processor=evaluator, edges=edges, indices=eval_edge_perm[s], ) for s in split_almost_equally(eval_edge_perm.size(0), num_parts=num_workers) ]) all_eval_stats_before = \ get_async_result(future_all_eval_stats_before, pool) eval_stats_before = Stats.sum(all_eval_stats_before).average() bucket_logger.info(f"Stats before training: {eval_stats_before}") io_time += time.time() - tic tic = time.time() # HOGWILD training bucket_logger.debug("Waiting for workers to perform training") # FIXME should we only delay if iteration_idx == 0? 
future_all_stats = pool.map_async(call, [ partial( process_in_batches, batch_size=config.batch_size, model=model, batch_processor=trainer, edges=edges, indices=edge_perm[s], delay=config.hogwild_delay if epoch_idx == 0 and rank > 0 else 0, ) for rank, s in enumerate(split_almost_equally(edge_perm.size(0), num_parts=num_workers)) ]) all_stats = get_async_result(future_all_stats, pool) stats = Stats.sum(all_stats).average() compute_time = time.time() - tic bucket_logger.info( f"bucket {total_buckets - remaining} / {total_buckets} : " f"Processed {num_edges} edges in {compute_time:.2f} s " f"( {num_edges / compute_time / 1e6:.2g} M/sec ); " f"io: {io_time:.2f} s ( {io_bytes / io_time / 1e6:.2f} MB/sec )") bucket_logger.info(f"{stats}") # HOGWILD eval after training eval_stats_after: Optional[Stats] = None if num_eval_edges > 0: bucket_logger.debug("Waiting for workers to perform evaluation") future_all_eval_stats_after = pool.map_async(call, [ partial( process_in_batches, batch_size=eval_batch_size, model=model, batch_processor=evaluator, edges=edges, indices=eval_edge_perm[s], ) for s in split_almost_equally(eval_edge_perm.size(0), num_parts=num_workers) ]) all_eval_stats_after = \ get_async_result(future_all_eval_stats_after, pool) eval_stats_after = Stats.sum(all_eval_stats_after).average() bucket_logger.info(f"Stats after training: {eval_stats_after}") # Add train/eval metrics to queue yield current_index, eval_stats_before, stats, eval_stats_after swap_partitioned_embeddings(cur_b, None) # Distributed Processing: all machines can leave the barrier now. sync.barrier() # Preserving a checkpoint requires two steps: # - create a snapshot (w/ symlinks) after it's first written; # - don't delete it once the following one is written. # These two happen in two successive iterations of the main loop: the # one just before and the one just after the epoch boundary. 
preserve_old_checkpoint = should_preserve_old_checkpoint( iteration_manager, config.checkpoint_preservation_interval) preserve_new_checkpoint = should_preserve_old_checkpoint( iteration_manager + 1, config.checkpoint_preservation_interval) # Write metadata: for multiple machines, write from rank-0 logger.info( f"Finished epoch {epoch_idx + 1} / {iteration_manager.num_epochs}, " f"edge path {edge_path_idx + 1} / {iteration_manager.num_edge_paths}, " f"edge chunk {edge_chunk_idx + 1} / {iteration_manager.num_edge_chunks}") if rank == 0: for entity, econfig in config.entities.items(): if econfig.num_partitions == 1: embs = model.get_embeddings(entity, Side.LHS) optimizer = trainer.entity_optimizers[(entity, Partition(0))] checkpoint_manager.write( entity, Partition(0), embs.detach(), OptimizerStateDict(optimizer.state_dict())) sanitized_state_dict: ModuleStateDict = {} for k, v in ModuleStateDict(model.state_dict()).items(): if k.startswith('lhs_embs') or k.startswith('rhs_embs'): # skipping state that's an entity embedding continue sanitized_state_dict[k] = v logger.info("Writing the metadata") checkpoint_manager.write_model( sanitized_state_dict, OptimizerStateDict(trainer.global_optimizer.state_dict()), ) logger.info("Writing the checkpoint") checkpoint_manager.write_new_version(config) dist_logger.info("Waiting for other workers to write their parts of the checkpoint") sync.barrier() dist_logger.info("All parts of the checkpoint have been written") logger.info("Switching to the new checkpoint version") checkpoint_manager.switch_to_new_version() dist_logger.info("Waiting for other workers to switch to the new checkpoint version") sync.barrier() dist_logger.info("All workers have switched to the new checkpoint version") # After all the machines have finished committing # checkpoints, we either remove the old checkpoint # or preserve it if preserve_new_checkpoint: # Add 1 so that the index is a multiple of the interval; it looks nicer. checkpoint_manager.preserve_current_version(config, epoch_idx + 1) if not preserve_old_checkpoint: checkpoint_manager.remove_old_version(config) # now we're sure that all partition files exist, # so be strict about loading them strict = True # quiescence pool.close() pool.join() sync.barrier() checkpoint_manager.close() if loadpath_manager is not None: loadpath_manager.close() # FIXME join distributed workers (not really necessary) logger.info("Exiting")
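# A sketch of the permutation-based holdout used in the training loop above
# (contrast with the sampling-based holdout earlier in this document): a seeded
# randperm makes the train/eval split deterministic across runs, while the extra
# unseeded randperm reshuffles only the training portion so workers see a fresh
# order each pass. Sizes and the seed below are illustrative.
import torch

num_edges, eval_fraction = 1_000, 0.05
num_eval = int(num_edges * eval_fraction)

g = torch.Generator()
g.manual_seed(1234)  # the trainer derives this from hash((path, chunk, lhs, rhs))
perm = torch.randperm(num_edges, generator=g)
eval_perm = perm[-num_eval:]                                   # deterministic split
train_perm = perm[:num_edges - num_eval][torch.randperm(num_edges - num_eval)]

assert set(train_perm.tolist()).isdisjoint(eval_perm.tolist())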
def __init__( self, root: Union[str, Path], *, download: bool = True, transform: Optional[ImageTform] = None, label_map: Optional[Dict[str, int]] = None, colors: Optional[List[int]] = None, num_colors: int = 10, scale: float = 0.2, correlation: Optional[float] = None, binarize: bool = False, greyscale: bool = False, background: bool = False, black: bool = True, split: Optional[Union[ColoredMNISTSplit, str]] = None, seed: Optional[int] = 42, ) -> None: self.split = (str_to_enum(str_=split, enum=ColoredMNISTSplit) if isinstance(split, str) else split) self.label_map = label_map self.scale = scale self.num_colors = num_colors self.colors = colors self.background = background self.binarize = binarize self.black = black self.greyscale = greyscale self.seed = seed # Note: a correlation coefficient of '1' corresponds to perfect correlation between # digit and class while a correlation coefficient of '-1' corresponds to perfect # anti-correlation. if correlation is None: correlation = 1.0 if split is ColoredMNISTSplit.train else 0.5 if not 0 <= correlation <= 1: raise ValueError( "Strength of correlation between colour and targets must be between 0 and 1." ) self.correlation = correlation if self.split is None: x_ls, y_ls = [], [] for _split in ColoredMNISTSplit: base_dataset = MNIST(root=str(root), download=download, train=_split is ColoredMNISTSplit.train) x_ls.append(base_dataset.data) y_ls.append(base_dataset.targets) x = torch.cat(x_ls, dim=0) y = torch.cat(y_ls, dim=0) else: base_dataset = MNIST(root=str(root), download=download, train=self.split is ColoredMNISTSplit.train) x = base_dataset.data y = base_dataset.targets if self.label_map is not None: x, y = _filter_data_by_labels(data=x, targets=y, label_map=self.label_map) s = y % self.num_colors s_unique, s_unique_inv = s.unique(return_inverse=True) generator = (torch.default_generator if self.seed is None else torch.Generator().manual_seed(self.seed)) inv_card_s = 1 / len(s_unique) if self.correlation < 1: flip_prop = self.correlation * (1.0 - inv_card_s) + inv_card_s # Change the values of randomly-selected labels to values other than their original ones # (use the seeded generator so the flips are deterministic given the seed) num_to_flip = round((1 - flip_prop) * len(s)) to_flip = torch.randperm(len(s), generator=generator)[:num_to_flip] s_unique_inv[to_flip] += torch.randint(low=1, high=len(s_unique), size=(num_to_flip, ), generator=generator) # s labels live inside the Z/(num_colors * Z) ring s_unique_inv[to_flip] %= len(s_unique) s = s_unique[s_unique_inv] # Convert the greyscale images of shape (H, W) into 'colour' images of shape (C, H, W) colorizer = MNISTColorizer( scale=self.scale, background=self.background, black=self.black, binarize=self.binarize, greyscale=self.greyscale, color_indices=self.colors, seed=self.seed, ) x_colorized = colorizer(images=x, labels=s) # Convert to HWC format for compatibility with transforms x_colorized = x_colorized.movedim(1, -1).numpy().astype(np.uint8) super().__init__(x=x_colorized, y=y, s=s, transform=transform, image_dir=root)
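# A sketch of the colour/label correlation mechanics above: with |S| colour
# groups, keeping a label with probability
#     flip_prop = correlation * (1 - 1/|S|) + 1/|S|
# interpolates between perfect correlation (correlation=1 keeps every label)
# and chance level (correlation=0 keeps a fraction 1/|S|). Values here are
# illustrative.
import torch

num_colors, correlation, n = 10, 0.5, 60_000
inv_card_s = 1 / num_colors
flip_prop = correlation * (1.0 - inv_card_s) + inv_card_s
num_to_flip = round((1 - flip_prop) * n)

g = torch.Generator().manual_seed(42)
to_flip = torch.randperm(n, generator=g)[:num_to_flip]
assert len(to_flip) == num_to_flip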
def forward(cls, ctx, input_x, drop_rate=0.5, target_fraction=1.0, train=False, inplace=False, unit_test_mode=False): rand_gen = torch.Generator() if unit_test_mode: rand_gen.manual_seed(353) if drop_rate < 0 or drop_rate > 1: raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(drop_rate)) if inplace: raise NotImplementedError("In place computations haven't been tested yet!") ctx.p = drop_rate ctx.train = train ctx.inplace = inplace ctx.input = input_x if ctx.inplace: ctx.mark_dirty(input_x) output = input_x else: output = input_x.clone() if ctx.p > 0 and ctx.train: ctx.noise = cls._make_noise(input_x) if ctx.p == 1: ctx.noise.fill_(0) else: ctx.noise.bernoulli_(1 - ctx.p, generator=rand_gen).div_(1 - ctx.p) is_filter_map = False if input_x.dim() > 3: is_filter_map = True if target_fraction < 1.0: if is_filter_map: input_shape = input_x.size() batch_size = input_shape[0] num_filters = input_shape[1] input_flattened_abs = torch.norm(input_x.view([batch_size, num_filters, -1]), 2, dim=2) feature_shape = input_flattened_abs.size()[1] n_features_to_drop = int(feature_shape * target_fraction) sorted_indices_per_row = torch.argsort(input_flattened_abs, dim=1) nth_ranked_feature_value_per_row = input_flattened_abs.gather(1, sorted_indices_per_row)[:, n_features_to_drop].view([-1, 1]) targeting_mask = input_flattened_abs.lt(nth_ranked_feature_value_per_row)[:, :, None, None] ctx.noise = ctx.noise.where(targeting_mask, torch.tensor([1.0]).type(input_x.dtype).to(input_x.device)) else: input_shape = input_x.size() batch_size = input_shape[0] input_flattened_abs = torch.abs(input_x.view([batch_size, -1])) feature_shape = input_flattened_abs.size()[1] n_features_to_drop = int(feature_shape * target_fraction) sorted_indices_per_row = torch.argsort(input_flattened_abs, dim=1) nth_ranked_feature_value_per_row = input_flattened_abs.gather(1, sorted_indices_per_row)[:, n_features_to_drop].view([-1, 1]) targeting_mask = input_flattened_abs.lt(nth_ranked_feature_value_per_row).view(input_shape) ctx.noise = ctx.noise.where(targeting_mask, torch.tensor([1.0]).type(input_x.dtype).to(input_x.device)) output.mul_(ctx.noise) return output
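# A condensed sketch of the "targeting" step above: only the target_fraction of
# features with the smallest magnitudes stays eligible for dropout; anything at
# or above the n-th ranked magnitude keeps a mask value of 1. Shapes, the drop
# rate, and values below are illustrative.
import torch

x = torch.randn(2, 10)                       # (batch, features)
target_fraction, p = 0.5, 0.5
n_eligible = int(x.size(1) * target_fraction)

magnitudes = x.abs()
threshold = magnitudes.sort(dim=1).values[:, n_eligible].unsqueeze(1)
eligible = magnitudes < threshold            # True where dropout may apply

mask = torch.bernoulli(torch.full_like(x, 1 - p)).div_(1 - p)
mask = torch.where(eligible, mask, torch.ones_like(mask))
out = x * mask                               # high-magnitude features always kept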
def __initialise_dataset(self): ############ Determine dataset ############ if self.parameters.dataset == SelectableDatasets.BPI2012: self.dataset = XESDataset( device=self.device, file_path=EnviromentParameters.BPI2020Dataset.file_path, preprocessed_folder_path=EnviromentParameters.BPI2020Dataset.preprocessed_foldr_path, preprocessed_df_type=EnviromentParameters.BPI2020Dataset.preprocessed_df_type, include_types=self.parameters.bpi2012.BPI2012_include_types, ) elif self.parameters.dataset == SelectableDatasets.Diabetes: self.feature_names = EnviromentParameters.DiabetesDataset.feature_names self.dataset = MedicalDataset( device=self.device, file_path=EnviromentParameters.DiabetesDataset.file_path, feature_names=EnviromentParameters.DiabetesDataset.feature_names, target_col_name=EnviromentParameters.DiabetesDataset.target_name ) elif self.parameters.dataset == SelectableDatasets.Helpdesk: self.dataset = XESDataset( device=self.device, file_path=EnviromentParameters.HelpDeskDataset.file_path, preprocessed_folder_path=EnviromentParameters.HelpDeskDataset.preprocessed_foldr_path, preprocessed_df_type=EnviromentParameters.HelpDeskDataset.preprocessed_df_type, ) elif self.parameters.dataset == SelectableDatasets.BreastCancer: self.feature_names = EnviromentParameters.BreastCancerDataset.feature_names self.dataset = MedicalDataset( device=self.device, file_path=EnviromentParameters.BreastCancerDataset.file_path, feature_names=EnviromentParameters.BreastCancerDataset.feature_names, target_col_name=EnviromentParameters.BreastCancerDataset.target_name ) else: raise NotSupportedError("The selected dataset is not supported") # Create datasets # Lengths for each set train_dataset_len = int( len(self.dataset) * self.parameters.train_test_split_portion[0] ) test_dataset_len = int( len(self.dataset) * self.parameters.train_test_split_portion[-1] ) validation_dataset_len = len(self.dataset) - ( train_dataset_len + test_dataset_len ) # Split the dataset ( self.train_dataset, self.validation_dataset, self.test_dataset, ) = torch.utils.data.random_split( dataset=self.dataset, lengths=[train_dataset_len, validation_dataset_len, test_dataset_len], generator=torch.Generator().manual_seed( self.parameters.dataset_split_seed ), ) # Initialise dataloaders self.train_data_loader = DataLoader( self.train_dataset, batch_size=self.parameters.batch_size, shuffle=self.train_dataset.dataset.get_train_shuffle(), collate_fn=self.dataset.collate_fn, sampler=self.train_dataset.dataset.get_sampler_from_df(self.train_dataset[:], self.parameters.dataset_split_seed) # num_workers=4, # worker_init_fn=lambda _: np.random.seed(int(torch.initial_seed()) % (2**32-1)), ) self.validation_data_loader = DataLoader( self.validation_dataset, batch_size=self.parameters.batch_size, shuffle=True, collate_fn=self.dataset.collate_fn, ) self.test_data_loader = DataLoader( self.test_dataset, batch_size=self.parameters.batch_size, shuffle=True, collate_fn=self.dataset.collate_fn, )
train_coefficients = hyperparam_conf['train_coefficients'] # %% mnist config dataset_config = conf['mnist_config'] max_rate = dataset_config['max_rate'] use_transform = dataset_config['use_transform'] # %% transform config if use_transform: rand_transform = get_rand_transform(conf['transform']) else: rand_transform = None # load mnist training dataset mnist_trainset = datasets.MNIST(root='./data', train=True, download=True, transform=rand_transform) mnist_trainset, mnist_devset = random_split(mnist_trainset, [50000, 10000], generator=torch.Generator().manual_seed(42)) # load mnist test dataset mnist_testset = datasets.MNIST(root='./data', train=False, download=True, transform=None) # acc file name acc_file_name = experiment_name + '_' + conf['acc_file_name'] # %% define model class mysnn(torch.nn.Module): def __init__(self): super().__init__() self.length = length self.batch_size = batch_size self.train_coefficients = train_coefficients
def shuffle(self, epoch): # deterministically shuffle based on epoch g = torch.Generator() g.manual_seed(epoch) bin_ids = list(torch.randperm(len(self.bins), generator=g)) self.bins = [self.bins[i] for i in bin_ids]
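# Usage sketch: because the permutation above is seeded by the epoch alone,
# every replica (or re-run) calling shuffle(epoch) with the same epoch reorders
# its bins identically, which is what makes this safe for distributed loading.
# _Bins is a hypothetical stand-in for the class this method belongs to.
import torch

class _Bins:
    def __init__(self, bins):
        self.bins = list(bins)

    def shuffle(self, epoch):
        g = torch.Generator()
        g.manual_seed(epoch)
        bin_ids = list(torch.randperm(len(self.bins), generator=g))
        self.bins = [self.bins[i] for i in bin_ids]

a, b = _Bins(range(8)), _Bins(range(8))
a.shuffle(epoch=3)
b.shuffle(epoch=3)
assert a.bins == b.bins  # identical order for identical epochs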
def __init__( self, data_dir: str, val_split: float = 0.2, test_split: float = 0.1, num_workers: int = 16, batch_size: int = 32, seed: int = 42, *args, **kwargs, ): """ Kitti train, validation and test dataloaders. Note: You need to have downloaded the Kitti dataset first and provide the path to where it is saved. You can download the dataset here: http://www.cvlibs.net/datasets/kitti/eval_semseg.php?benchmark=semantics2015 Specs: - 200 samples - Each image is (3 x 1242 x 376) In total there are 34 classes but some of these are not useful so by default we use only 19 of the classes specified by the `valid_labels` parameter. Example:: from pl_bolts.datamodules import KittiDataModule dm = KittiDataModule(PATH) model = LitModel() Trainer().fit(model, dm) Args: data_dir: where to load the data from path, i.e. '/path/to/folder/with/data_semantics/' val_split: size of validation test (default 0.2) test_split: size of test set (default 0.1) num_workers: how many workers to use for loading data batch_size: the batch size seed: random seed to be used for train/val/test splits """ if not _TORCHVISION_AVAILABLE: raise ModuleNotFoundError( # pragma: no-cover 'You want to use `torchvision` which is not installed yet.') super().__init__(*args, **kwargs) self.data_dir = data_dir if data_dir is not None else os.getcwd() self.batch_size = batch_size self.num_workers = num_workers self.seed = seed self.default_transforms = transforms.Compose([ transforms.ToTensor(), transforms.Normalize(mean=[0.35675976, 0.37380189, 0.3764753], std=[0.32064945, 0.32098866, 0.32325324]) ]) # split into train, val, test kitti_dataset = KittiDataset(self.data_dir, transform=self.default_transforms) val_len = round(val_split * len(kitti_dataset)) test_len = round(test_split * len(kitti_dataset)) train_len = len(kitti_dataset) - val_len - test_len self.trainset, self.valset, self.testset = random_split( kitti_dataset, lengths=[train_len, val_len, test_len], generator=torch.Generator().manual_seed(self.seed))
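# A sketch verifying the reproducibility the seeded split above relies on:
# re-creating a random_split with the same seed yields identical index
# assignments, so train/val/test membership is stable across runs. TensorDataset
# stands in for KittiDataset; the lengths mirror a 0.2/0.1 split of 200 samples.
import torch
from torch.utils.data import TensorDataset, random_split

ds = TensorDataset(torch.arange(200))
lengths = [140, 40, 20]
split_a = random_split(ds, lengths, generator=torch.Generator().manual_seed(42))
split_b = random_split(ds, lengths, generator=torch.Generator().manual_seed(42))
assert split_a[0].indices == split_b[0].indices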