def run_epoch():
    """Train the model for one epoch over a freshly shuffled dataset slice.

    Zero-argument closure: `self`, `logger`, `qqdm`, `format_str`, `colored`,
    `clip_grad_norm_`, `torch` and `F` are expected from the enclosing scope.

    Side effects:
        Updates model parameters, `self.row` (moving-average router weights),
        `self.training_loss`, `self.global_step`, and tensorboard scalars.
    """
    self.model.train(True)
    losses = []
    # Smoothing factor for the row-weight moving average (named, was magic 0.03).
    row_ema_alpha = 0.03
    # Re-create dataloader every time we call train.
    # This way, since epoch_length < len(dataset), we can make sure that the
    # dataset is randomly shuffled each time we train for an epoch.
    logger.info("Preparing dataset batch...")
    dataset = self.shuffle_dataset_epoch_length()
    pbar = qqdm(enumerate(dataset), total=len(dataset), desc=format_str('blue', f'Epoch Progress'))
    for it, batch in pbar:
        # ---- Forward pass ----
        batch = batch.to(self.model.device)
        output = self.model.remote_forward(self.neuron, batch, training=True)
        # ---- Backward pass ----
        loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
        loss.backward()
        # ---- Gradient step ----
        clip_grad_norm_(self.model.parameters(), self.config.miner.clip_gradients)
        self.optimizer.step()
        self.optimizer.zero_grad()
        self.decay_learning_rate(batch)
        losses.append(loss.item())
        # ---- Train row weights ----
        batch_weights = torch.mean(output.router.weights, axis=0).to(self.model.device)  # Average over batch.
        self.row = (1 - row_ema_alpha) * self.row + row_ema_alpha * batch_weights  # Moving avg update.
        self.row = F.normalize(self.row, p=1, dim=0)  # Ensure normalization.
        # ---- Progress-bar / tensorboard logging ----
        pbar.set_infos({
            'GS': colored('{}'.format(self.global_step), 'red'),
            'LS': colored('{}'.format(it), 'blue'),
            'Epoch': colored('{}'.format(self.epoch+1), 'green'),
            'Local loss': colored('{:.5f}'.format(output.local_target_loss.item()), 'red'),
            'Remote loss': colored('{:.5f}'.format(output.remote_target_loss.item()), 'blue'),
            'Distillation loss': colored('{:.5f}'.format(output.distillation_loss.item()), 'green'),
            'Learning Rate:': colored('{:e}'.format(self.lr), 'white'),
            'Axon': self.neuron.axon.__str__(),
            'Dendrite': self.neuron.dendrite.__str__(),
        })
        self.tensorboard.add_scalar('Neuron/Rloss', output.remote_target_loss.item(), self.global_step)
        self.tensorboard.add_scalar('Neuron/Lloss', output.local_target_loss.item(), self.global_step)
        self.tensorboard.add_scalar('Neuron/Dloss', output.distillation_loss.item(), self.global_step)
        self.global_step += 1
    # Guard against an empty dataset: the original sum/len raised
    # ZeroDivisionError when no batches were produced.
    if losses:
        self.training_loss = sum(losses) / len(losses)
    else:
        logger.warning("Epoch produced no batches; training loss left unchanged.")
def run_next_training_epoch(self, training_batches: List[dict]) -> float:
    r""" Called by miner.run(), calls training_call for passed batches.

        Args:
            training_batches (List[dict]): Training batches as returned by get_epoch_batches.

        Returns:
            float: Mean local-target loss over the processed batches
                (0.0 when ``training_batches`` is empty).
    """
    total_epoch_loss = 0.0
    # Running mean of the local target loss; also mirrored to self.epoch_loss
    # after each step so external observers see live progress.
    epoch_loss = 0.0
    progress_bar = qqdm(enumerate(training_batches), total=len(training_batches), desc=format_str('blue', f'Epoch Progress'))
    for iteration, training_batch in progress_bar:
        output = self.training_call(batch=training_batch)
        total_epoch_loss += output.local_target_loss.item()
        epoch_loss = total_epoch_loss / (iteration + 1)
        self.epoch_loss = epoch_loss
        self.global_step += 1
        self.training_logs(progress_bar, iteration=iteration, output=output)
    # Fix: the signature promises a float but the original returned None.
    return epoch_loss
def train_epoch(self, epoch, dataloader):
    """Run one optimization epoch over ``dataloader``.

    Args:
        epoch: Zero-based epoch index (used for the bar label and start plots).
        dataloader: Yields lists of preprocessed sample dicts (one dict per item
            in the batch); tensor-like values are moved to ``self.device``.

    Returns:
        dict: Accumulated (summed, not averaged) per-epoch loss statistics.
    """
    epoch_losses = {
        "loss_epoch": 0.0,
        "loss_point_cloud_epoch": 0.0,
        "loss_field_of_view_epoch": 0.0,
        "loss_po2po_epoch": 0.0,
        "loss_po2pl_epoch": 0.0,
        "loss_pl2pl_epoch": 0.0,
        "visible_pixels_epoch": 0.0,
        "loss_yaw_pitch_roll_epoch": np.zeros(3),
        "loss_true_trafo_epoch": 0.0,
    }
    progress = qqdm.qqdm(dataloader, desc=qqdm.format_str(
        'blue', 'Epoch ' + str(epoch)))
    # Progress-bar label -> accumulator key; rendered as running means below.
    bar_fields = {
        'loss': 'loss_epoch',
        'loss_point_cloud': 'loss_point_cloud_epoch',
        'loss_po2po': 'loss_po2po_epoch',
        'loss_po2pl': 'loss_po2pl_epoch',
        'loss_pl2pl': 'loss_pl2pl_epoch',
        'visible_pixels': 'visible_pixels_epoch',
    }
    for step, sample_dicts in enumerate(progress):
        # Move every tensor-like value of every sample onto the training device.
        for sample in sample_dicts:
            for key in sample:
                if hasattr(sample[key], 'to'):
                    sample[key] = sample[key].to(self.device)
        self.optimizer.zero_grad()
        # Log images only on the first and the last step of the epoch.
        log_images = step in (0, self.steps_per_epoch - 1)
        epoch_losses, _ = self.step(
            preprocessed_dicts=sample_dicts,
            epoch_losses=epoch_losses,
            log_images_bool=log_images)
        # Plotting and logging use only the first sample of the batch.
        first_sample = sample_dicts[0]
        # Plot at the very beginning to capture the network's initial state.
        if epoch == 0 and step == 0 and not self.config["po2po_alone"]:
            self.log_image(epoch=epoch,
                           string="_start" + "_" + first_sample["dataset"])
        progress.set_infos({
            label: f'{float(epoch_losses[key] / (step + 1)):.6f}'
            for label, key in bar_fields.items()
        })
    return epoch_losses
def run( config , validator, subtensor, wallet, metagraph, dataset, device, uid, dendrite):
    """Validator main loop: trains `validator` against streamed dataset batches,
    paced by chain blocks, and periodically publishes top-k EMA scores as
    mechanism weights on chain. Runs forever (no return).

    Args:
        config: bittensor config object (wandb, neuron.* hyperparameters).
        validator: torch module producing (loss, _, query_uids) per batch.
        subtensor: chain interface (block height, set_weights).
        wallet: key pair used for telemetry and weight submission.
        metagraph: network state (stake, rank, active peers, ...).
        dataset: iterator of training batches (consumed via next()).
        device: torch device string for all tensors.
        uid: this validator's uid in the metagraph.
        dendrite: RPC client, used only for wandb dataframes here.
    """
    print(config)
    config.to_defaults()
    validator = validator.to(device)
    optimizer = torch.optim.SGD(
        validator.parameters(),
        lr = config.neuron.learning_rate,
        momentum = config.neuron.momentum,
    )
    if config.wandb.api_key != 'default':
        # Create wandb for telemetry.
        bittensor.wandb(
            config = config,
            cold_pubkey = wallet.coldkeypub.ss58_address,
            hot_pubkey = wallet.hotkey.ss58_address,
            root_dir = config.neuron.full_path
        )
    # Optionally resume from the last checkpoint (best-effort: failure only logs).
    if config.neuron.no_restart != True:
        try:
            validator.load_state_dict( torch.load("{}/validator.torch".format( config.neuron.full_path ))['validator'], strict=False )
        except Exception as e:
            logger.error('Error reloading model: {} '.format(e))
    # --- last sync block
    last_sync_block = subtensor.get_current_block()
    # --- Run Forever.
    epoch = 0
    global_step = 0
    best_loss = math.inf
    ema_score_decay = 0.995
    # NOTE(review): zeros_like(...) * (1/n) is still all zeros — the scaling
    # factor is a no-op. The sibling miner loop uses torch.ones * (1/n) for a
    # uniform prior; confirm whether zeros here is intentional.
    ema_scores = torch.nn.Parameter(torch.zeros_like(validator.peer_weights, device = device) * (1 / metagraph.n.item()), requires_grad = False)
    while True:
        # --- Run epoch: one epoch spans config.neuron.blocks_per_epoch chain blocks.
        start_block = subtensor.get_current_block() + 1
        end_block = start_block + config.neuron.blocks_per_epoch
        blocks = [ block for block in range(start_block, end_block) ]
        progress = qqdm( blocks, total=len(blocks), desc=format_str('white', f'Epoch'))
        progress.set_bar = partial(progress.set_bar, element='#')
        # --- Reset the epoch logs
        total_epoch_score = torch.zeros(metagraph.n.item(), device = device)
        total_epoch_loss = 0
        batch_count = 0
        for block in progress:
            # --- Training step: keep stepping until the chain reaches `block`.
            current_block = subtensor.get_current_block()
            # NOTE(review): if the chain is already past `block`, this inner
            # loop never runs and `loss` below is unbound (NameError); likewise
            # batch_count can stay 0 and the epoch_loss division would fail.
            while block >= current_block:
                loss, _, query_uids = validator( next( dataset ) )
                val_score = validator.scores()
                # L1-normalized non-negative scores; queried uids get a small
                # epsilon so they are never exactly zero.
                scores = torch.nn.functional.normalize ( torch.relu( val_score ), p=1, dim = 0 )
                scores[query_uids] += 1e-6
                loss.backward()
                clip_grad_norm_(validator.parameters(), config.neuron.clip_gradients)
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                batch_count += 1
                total_epoch_score += scores.detach()
                total_epoch_loss += loss.item()
                # Exponential moving average of the per-step scores.
                ema_scores = (ema_score_decay * ema_scores) + (1 - ema_score_decay) * scores.detach()
                current_block = subtensor.get_current_block()
            # --- Step logs.
            info = {
                'Step': colored('{}'.format(global_step), 'red'),
                'Epoch': colored('{}'.format(epoch), 'yellow'),
                'Best-loss': colored('{:.4f}'.format(best_loss), 'green'),
                'Loss': colored('{:.4f}'.format(loss.item()), 'blue'),
                'nPeers': colored(metagraph.n.item(), 'red'),
                'Stake(\u03C4)': colored('{:.3f}'.format(metagraph.S[uid].item()), 'yellow'),
                'Rank(\u03C4)': colored('{:.3f}'.format(metagraph.R[uid].item()), 'green'),
                'Incentive(\u03C4/block)': colored('{:.6f}'.format(metagraph.I[uid].item()), 'blue'),
                'Dividends': colored('{:.4f}'.format(metagraph.D[ uid ].item()), 'red'),
                'Current Block': colored('{}'.format(block), 'yellow')
            }
            # Show the 5 highest EMA scores; green if the latest step beat the EMA.
            topk_scores, topk_idx = bittensor.unbiased_topk(ema_scores, 5, dim=0)
            for idx, ema_score in zip(topk_idx, topk_scores):
                color = 'green' if scores[idx] - ema_score > 0 else 'red'
                info[f'uid_{idx.item()}'] = colored('{:.4f}'.format(ema_score), color)
            progress.set_infos( info )
        # --- End of epoch
        # --- Set mechanism weights: zero out inactive peers, then publish top-k.
        inactive_uids = torch.where(metagraph.active == 0)[0]
        ema_scores[inactive_uids] = 0
        topk_scores, topk_uids = bittensor.unbiased_topk( ema_scores.detach().to('cpu'), k = min(config.neuron.n_topk_peer_weights, metagraph.n.item()))
        subtensor.set_weights(
            uids = topk_uids,
            weights = topk_scores,
            wait_for_inclusion = False,
            wallet = wallet,
        )
        # --- Log.
        epoch_loss = total_epoch_loss / batch_count
        active_uids = torch.where(metagraph.active > 0)[0]
        nn = subtensor.neuron_for_pubkey(wallet.hotkey.ss58_address)
        if config.wandb.api_key != 'default':
            wandb_data = {
                'stake': nn.stake,
                'dividends': nn.dividends,
                'epoch_loss': epoch_loss,
                'STD in scores': torch.std(ema_scores[active_uids]).item(),
            }
            df = pandas.concat( [
                bittensor.utils.indexed_values_to_dataframe( prefix = 'fisher_ema_score', index = topk_uids, values = ema_scores ),
                dendrite.to_dataframe( metagraph = metagraph )
            ], axis = 1)
            df['uid'] = df.index
            wandb_dendrite = dendrite.to_wandb()
            wandb.log( {**wandb_data, **wandb_dendrite}, step = current_block )
            wandb.log( { 'stats': wandb.Table( dataframe = df ) }, step = current_block )
        # --- Save checkpoint whenever the epoch loss improves.
        if best_loss > epoch_loss :
            best_loss = epoch_loss
            torch.save( { 'validator': validator.state_dict() }, "{}/validator.torch".format( config.neuron.full_path ))
        # --- Resync metagraph periodically and grow ema_scores if peers joined.
        if current_block - last_sync_block > config.neuron.metagraph_sync:
            metagraph.sync()
            last_sync_block = current_block
            validator.sync_with_chain_state()
            chain_growth = max(0, metagraph.n.item() - torch.numel( ema_scores ))
            ema_scores = torch.nn.Parameter(torch.cat([ema_scores, torch.zeros([chain_growth], dtype=torch.float32, requires_grad=False, device = device)]))
        epoch += 1
def run_epoch():
    """Train the model for one epoch over a fixed-length dataloader slice.

    Zero-argument closure: `self`, `logger`, `qqdm`, `format_str`, `colored`,
    `clip_grad_norm_`, `torch` and `F` are expected from the enclosing scope.

    Side effects:
        Updates model parameters, `self.row`, `self.global_step`, tensorboard
        scalars, and the epoch statistics `self.training_loss` / `self.rloss`
        / `self.lloss` / `self.dloss`.
    """
    self.model.train(True)
    losses = []
    rlosses = []
    llosses = []
    dlosses = []
    # Smoothing factor for the row-weight moving average (named, was magic 0.03).
    row_ema_alpha = 0.03
    # we train for an epoch.
    logger.info("Preparing dataset batch...")
    # Set up the dataloader
    dataloader = self.dataset.dataloader(self.config.miner.epoch_length)
    pbar = qqdm(enumerate(dataloader), total=len(dataloader), desc=format_str('blue', f'Epoch Progress'))
    for it, batch in pbar:
        # ---- Forward pass ----
        batch = batch.to(self.model.device)
        output = self.model.remote_forward(self, batch, training=True)
        # ---- Backward pass ----
        loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
        loss.backward()
        # ---- Gradient Step ----
        clip_grad_norm_(self.model.parameters(), self.config.miner.clip_gradients)
        self.optimizer.step()
        self.optimizer.zero_grad()
        self.decay_learning_rate(batch)
        # Add losses up
        losses.append(loss.item())
        llosses.append(output.local_target_loss.item())
        rlosses.append(output.remote_target_loss.item())
        dlosses.append(output.distillation_loss.item())
        # ---- Train row weights ----
        batch_weights = torch.mean(output.router.weights, axis=0).to(self.model.device)  # Average over batch.
        self.row = (1 - row_ema_alpha) * self.row + row_ema_alpha * batch_weights  # Moving avg update.
        self.row = F.normalize(self.row, p=1, dim=0)  # Ensure normalization.
        # ---- Logging ----
        index = self.metagraph.state.index_for_uid[self.metagraph.uid]
        pbar.set_infos({
            'GS': colored('{}'.format(self.global_step), 'red'),
            'LS': colored('{}'.format(it), 'blue'),
            'Epoch': colored('{}'.format(self.epoch+1), 'green'),
            'L-loss': colored('{:.5f}'.format(output.local_target_loss.item()), 'red'),
            'R-loss': colored('{:.5f}'.format(output.remote_target_loss.item()), 'blue'),
            'D-loss': colored('{:.5f}'.format(output.distillation_loss.item()), 'green'),
            'lr': colored('{:e}'.format(self.lr), 'white'),
            'nPeers': self.metagraph.n,
            'Stake(\u03C4)': float(self.metagraph.S[index]),
            'Rank(\u03C4)': float(self.metagraph.R[index]),
            'Incentive(\u03C4/block)': float(self.metagraph.I[index]),
            'Axon': self.axon.__str__(),
            'Dendrite': self.dendrite.__str__(),
        })
        self.tensorboard.add_scalar('Neuron/Rloss', output.remote_target_loss.item(), self.global_step)
        self.tensorboard.add_scalar('Neuron/Lloss', output.local_target_loss.item(), self.global_step)
        self.tensorboard.add_scalar('Neuron/Dloss', output.distillation_loss.item(), self.global_step)
        self.global_step += 1
    # Guard against an empty dataloader: the original sum/len expressions
    # raised ZeroDivisionError when no batches were produced.
    if not losses:
        logger.warning("Dataloader produced no batches; epoch statistics left unchanged.")
        return
    self.rloss = sum(rlosses) / len(rlosses)
    self.lloss = sum(llosses) / len(llosses)
    self.dloss = sum(dlosses) / len(dlosses)
    self.training_loss = sum(losses) / len(losses)
def run( self ):
    r""" Miner main loop.

    Trains the nucleus against streamed dataset batches, paced by chain
    blocks, for config.neuron.n_epochs epochs. Handles wandb setup, optional
    state reload, periodic metagraph sync, per-epoch checkpointing, and
    restart-on-failure. KeyboardInterrupt exits cleanly.
    """
    # ---- Build Bittensor neuron ----
    with self:
        if self.config.neuron.use_wandb:
            bittensor.wandb(
                config = self.config,
                cold_pubkey = self.wallet.coldkeypub.ss58_address,
                hot_pubkey = self.wallet.hotkey.ss58_address,
                root_dir = self.config.neuron.full_path
            )
        # ---- Init run state ----
        self.epoch = 0
        # ---- reloads previous run if not restart ----
        # (save first when no_restart so reload below has a checkpoint to read)
        if self.config.neuron.no_restart:
            self.save()
        try:
            self.reload()
            self.axon.check()
        except Exception as e:
            # Reload failed: recreate a fresh checkpoint and retry once.
            logger.error("Error when trying to reload model: {}".format(e))
            self.save()
            self.reload()
            self.axon.check()
        # Uniform prior over peers for the score EMA.
        self.stats.ema_scores = torch.nn.Parameter(torch.ones(self.metagraph.n.item()).to(self.device) * (1 / self.metagraph.n.item()), requires_grad = False)
        # --- Run until n_epochs ----
        while self.epoch < self.config.neuron.n_epochs:
            try:
                # --- Init epoch stat----
                self.stats.epoch_data_size = 0
                self.stats.epoch_sync_count = 0
                total_local_target_epoch_loss = 0
                total_distillation_epoch_loss = 0
                total_remote_target_epoch_loss = 0
                total_local_epoch_acc = 0
                batches_count = 0
                # ---- Run epoch: one epoch spans config.neuron.epoch_length blocks ----
                start_block = self.subtensor.get_current_block() + 1
                end_block = start_block + self.config.neuron.epoch_length
                block_steps = [ block_delta for block_delta in range(start_block, end_block)]
                progress_bar = qqdm( block_steps, total=len(block_steps), desc=format_str('blue', f'Epoch:'))
                progress_bar.set_bar = partial(progress_bar.set_bar, element='#')
                for block in progress_bar:
                    # --- Iterate over batches until the end of the block.
                    current_block = self.subtensor.get_current_block()
                    # NOTE(review): if the chain is already past `block`, this
                    # inner loop is skipped and `output`/`scores`/`inputs`
                    # below reuse the previous iteration's values (or are
                    # unbound on the very first block) — confirm intended.
                    while block >= current_block:
                        # ---- Forward pass ----
                        inputs = next( self.dataset )
                        output = self.nucleus.remote_forward (
                            inputs = inputs.to( self.device ),
                            training = True,
                        )
                        # ---- Backward pass ----
                        output.loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
                        # L1-normalized non-negative peer scores; queried uids
                        # get a small epsilon so they are never exactly zero.
                        scores = torch.nn.functional.normalize ( torch.relu( self.nucleus.compute_scores(output.remote_target_loss) ), p=1, dim = 0 )
                        scores[output.query_uids] += 1e-6
                        output.loss.backward() # Accumulates gradients on the nucleus.
                        clip_grad_norm_(self.nucleus.parameters(), self.config.neuron.clip_gradients)
                        # ---- Apply and zero accumulated gradients.
                        self.optimizer.step()
                        self.optimizer.zero_grad()
                        current_block = self.subtensor.get_current_block()
                    # ---- Aggregate outputs and losses (once per block)
                    total_local_target_epoch_loss += output.local_target_loss.item()
                    total_distillation_epoch_loss += output.distillation_loss.item()
                    total_remote_target_epoch_loss += output.remote_target_loss.item()
                    total_local_epoch_acc += output.local_accuracy
                    self.stats.epoch_data_size += inputs.nelement()
                    batches_count += 1
                    # ---- Expand ema_scores tensor if the chain grew and aggregate the score
                    chain_growth = max(scores.shape[0] - self.stats.ema_scores.shape[0], 0)
                    if chain_growth > 0:
                        self.stats.ema_scores = torch.nn.Parameter(torch.cat( [self.stats.ema_scores, torch.zeros([chain_growth], dtype=torch.float32, device = self.device)]), requires_grad=False)
                    self.stats.ema_scores = self.fisher_ema_decay * self.stats.ema_scores + (1 - self.fisher_ema_decay) * scores
                    self.stats.scores = scores
                    # ---- Sync with metagraph if the current block >= last synced block + sync block time
                    current_block = self.subtensor.get_current_block()
                    block_diff = current_block - self.stats.last_sync_block
                    if block_diff >= self.config.neuron.sync_block_time:
                        self.sync(current_block)
                        self.stats.last_sync_block = current_block
                        self.stats.epoch_sync_count += 1
                    # ---- Update the epoch loss if it is the last iteration within epoch
                    if block+1 == end_block :
                        self.stats.local_target_epoch_loss = total_local_target_epoch_loss / batches_count
                        self.stats.distillation_epoch_loss = total_distillation_epoch_loss / batches_count
                        self.stats.remote_target_epoch_loss = total_remote_target_epoch_loss / batches_count
                        self.stats.local_epoch_acc = total_local_epoch_acc / batches_count
                    # ---- Block logs.
                    self.logs (
                        progress_bar,
                        iteration = block-start_block,
                        output = output,
                    )
                    self.stats.global_step += 1
                # ---- Update params ----
                self.epoch += 1
                # ---- Checkpoint state ----
                self.checkpoint()
            except KeyboardInterrupt:
                # --- User ended session ----
                break
            except Exception as e:
                # --- Unknown error ----
                logger.exception('Unknown exception: {} with traceback {}', e, traceback.format_exc())
                if self.config.neuron.restart_on_failure == True:
                    logger.info('Restarting from last saved state.')
                    self.reload()
                else:
                    break
def train(num_epochs, learning_rate, train_dataloader, model_type):
    """Train one autoencoder variant and checkpoint the best/last models.

    Args:
        num_epochs (int): Total epochs (including warm-up).
        learning_rate (float): Adam learning rate.
        train_dataloader: Yields batched input tensors.
        model_type (str): One of 'resnet', 'fcn', 'cnn', 'vae'.

    Side effects:
        Saves 'best_{train_name}.pt' / 'last_{train_name}.pt' and logs to
        wandb. Relies on module-level `config`, `train_name`, `loss_vae`,
        `qqdm`, `format_str` and the model classes.
    """
    # Model — store constructors and build only the requested model
    # (the original eagerly instantiated all four networks).
    model_classes = {
        'resnet': Resnet,
        'fcn': fcn_autoencoder,
        'cnn': conv_autoencoder,
        'vae': VAE,
    }
    model = model_classes[model_type]().cuda()
    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # Training loop
    best_loss = np.inf
    model.train()

    # Learning-rate schedule: linear warm-up followed by cosine decay
    # (named function instead of an assigned lambda — PEP 8 E731).
    def warm_up_with_cosine_lr(epoch):
        if epoch <= config['warm_up_epochs']:
            return epoch / config['warm_up_epochs']
        return 0.5 * (math.cos((epoch - config['warm_up_epochs']) /
                               (num_epochs - config['warm_up_epochs']) * math.pi) + 1)

    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer, lr_lambda=warm_up_with_cosine_lr)
    qqdm_train = qqdm(range(num_epochs), desc=format_str('bold', 'Description'))
    for epoch in qqdm_train:
        tot_loss = list()
        for data in train_dataloader:
            # ===================loading=====================
            # All model types take float CUDA tensors; 'fcn' additionally
            # flattens each sample (the duplicated branch was merged).
            img = data.float().cuda()
            if model_type == 'fcn':
                img = img.view(img.shape[0], -1)
            # ===================forward=====================
            output = model(img)
            if model_type in ['vae']:
                loss = loss_vae(output[0], img, output[1], output[2], criterion)
            else:
                loss = criterion(output, img)
            tot_loss.append(loss.item())
            # ===================backward====================
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # ===================save_best====================
        mean_loss = np.mean(tot_loss)
        if mean_loss < best_loss:
            best_loss = mean_loss
            torch.save(model, 'best_{}.pt'.format(train_name))
        # ===================log========================
        qqdm_train.set_infos({
            'epoch': f'{epoch + 1:.0f}/{num_epochs:.0f}',
            'loss': f'{mean_loss:.4f}',
        })
        # warm up
        scheduler.step()
        realLearningRate = scheduler.get_last_lr()[0]
        # wandb
        wandb.log({
            'epoch': epoch + 1,
            'train_loss': mean_loss,
            'learningRate': realLearningRate
        })
    # ===================save_last========================
    torch.save(model, 'last_{}.pt'.format(train_name))
import time import random from qqdm import qqdm, format_str start = time.time() for ep in range(2): tw = qqdm(range(100), desc=format_str('blue', f'asdadsfd')) time.sleep(1) for i in tw: tw.set_infos({ 'loss': f'{random.random():.4f}', 'acc': f'{random.random():.4f}', }) time.sleep(.01) print('Done') time.sleep(2) # tw = qqdm(range(100), desc=format_str('blue', f'test enum')) time.sleep(.1) for i, item in qqdm(enumerate(range(100))): #, desc=format_str('blue', f'test enum')): tw.set_infos({ 'loss': f'{random.random():.4f}', 'acc': f'{random.random():.4f}', }) time.sleep(.01) print('Done') time.sleep(.4) for i, item in qqdm(enumerate(range(100)), desc=format_str('blue', f'test enum')):