Example 1
        def run_epoch():
            self.model.train(True)
            losses = []

            # Re-create dataloader every time we call train
            # This way, since epoch_length < len(dataset), we can
            # make sure that the dataset is randomly shuffled each time
            # we train for an epoch.
            logger.info("Preparing dataset batch...")
            dataset = self.shuffle_dataset_epoch_length()
            pbar = qqdm(enumerate(dataset), total=len(dataset), desc=format_str('blue', f'Epoch Progress'))

            for it, (batch) in pbar:
                batch = batch.to(self.model.device)
                output = self.model.remote_forward(self.neuron, batch, training=True)
                loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
                loss.backward()

                clip_grad_norm_(self.model.parameters(), self.config.miner.clip_gradients)
                self.optimizer.step()
                self.optimizer.zero_grad()

                self.decay_learning_rate(batch)

                losses.append(loss.item())

                # ---- Train row weights ----
                batch_weights = torch.mean(output.router.weights, axis = 0).to(self.model.device) # Average over batch.
                self.row = (1 - 0.03) * self.row + 0.03 * batch_weights # Moving avg update.
                self.row = F.normalize(self.row, p = 1, dim = 0) # Ensure normalization.

                pbar.set_infos({
                    'GS': colored('{}'.format(self.global_step), 'red'),
                    'LS': colored('{}'.format(it), 'blue'),
                    'Epoch': colored('{}'.format(self.epoch+1), 'green'),
                    'Local loss': colored('{:.5f}'.format(output.local_target_loss.item()), 'red'),
                    'Remote loss': colored('{:.5f}'.format(output.remote_target_loss.item()), 'blue'),
                    'Distillation loss': colored('{:.5f}'.format(output.distillation_loss.item()), 'green'),
                    'Learning Rate:': colored('{:e}'.format(self.lr), 'white'),
                    'Axon': self.neuron.axon.__str__(),
                    'Dendrite': self.neuron.dendrite.__str__(),
                })

                self.tensorboard.add_scalar('Neuron/Rloss', output.remote_target_loss.item(), self.global_step)
                self.tensorboard.add_scalar('Neuron/Lloss', output.local_target_loss.item(), self.global_step)
                self.tensorboard.add_scalar('Neuron/Dloss', output.distillation_loss.item(), self.global_step)
                self.global_step += 1


            avg_loss = sum(losses) / len(losses)
            self.training_loss = avg_loss
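
The row-weight update above is a small pattern on its own: average the router weights over the batch, blend them into a running estimate with a fixed factor, and re-normalize so they remain a valid distribution. A minimal sketch of just that step, with hypothetical shapes and the 0.03 blend factor taken from the example:

import torch
import torch.nn.functional as F

alpha = 0.03                       # blend factor used in the example
row = torch.full((10,), 0.1)       # hypothetical current row weights (L1-normalized)
batch_weights = torch.rand(4, 10)  # hypothetical router weights for a batch of 4

avg = batch_weights.mean(dim=0)        # average over the batch dimension
row = (1 - alpha) * row + alpha * avg  # exponential moving-average update
row = F.normalize(row, p=1, dim=0)     # re-normalize so the weights sum to 1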
Example 2
 def run_next_training_epoch(self, training_batches: List[dict]) -> float:
     r""" Called by miner.run(), calls training_call for passed batches.
         Args:
             training_batches (List[dict]):
                 Training batches as returned by get_epoch_batches.
     """
     total_epoch_loss = 0.0
     progress_bar = qqdm(enumerate(training_batches),
                         total=len(training_batches),
                         desc=format_str('blue', f'Epoch Progress'))
     for iteration, (training_batch) in progress_bar:
         output = self.training_call(batch=training_batch)
         total_epoch_loss += output.local_target_loss.item()
         self.epoch_loss = total_epoch_loss / (iteration + 1)
         self.global_step += 1
         self.training_logs(progress_bar,
                            iteration=iteration,
                            output=output)
Example 3
    def train_epoch(self, epoch, dataloader):

        epoch_losses = {
            "loss_epoch": 0.0,
            "loss_point_cloud_epoch": 0.0,
            "loss_field_of_view_epoch": 0.0,
            "loss_po2po_epoch": 0.0,
            "loss_po2pl_epoch": 0.0,
            "loss_pl2pl_epoch": 0.0,
            "visible_pixels_epoch": 0.0,
            "loss_yaw_pitch_roll_epoch": np.zeros(3),
            "loss_true_trafo_epoch": 0.0,
        }
        counter = 0

        qqdm_dataloader = qqdm.qqdm(dataloader,
                                    desc=qqdm.format_str(
                                        'blue', 'Epoch ' + str(epoch)))

        for preprocessed_dicts in qqdm_dataloader:
            # Load corresponding preprocessed kd_tree
            for preprocessed_dict in preprocessed_dicts:
                # Move data to devices:
                for key in preprocessed_dict:
                    if hasattr(preprocessed_dict[key], 'to'):
                        preprocessed_dict[key] = preprocessed_dict[key].to(
                            self.device)

            self.optimizer.zero_grad()

            epoch_losses, _ = (self.step(
                preprocessed_dicts=preprocessed_dicts,
                epoch_losses=epoch_losses,
                log_images_bool=counter == self.steps_per_epoch - 1
                or counter == 0))

            # Plotting and logging --> only first one in batch
            preprocessed_data = preprocessed_dicts[0]
            # Plot at very beginning to see initial state of the network
            if epoch == 0 and counter == 0 and not self.config["po2po_alone"]:
                self.log_image(epoch=epoch,
                               string="_start" + "_" +
                               preprocessed_data["dataset"])

            qqdm_dataloader.set_infos({
                'loss':
                f'{float(epoch_losses["loss_epoch"] / (counter + 1)):.6f}',
                'loss_point_cloud':
                f'{float(epoch_losses["loss_point_cloud_epoch"] / (counter + 1)):.6f}',
                'loss_po2po':
                f'{float(epoch_losses["loss_po2po_epoch"] / (counter + 1)):.6f}',
                'loss_po2pl':
                f'{float(epoch_losses["loss_po2pl_epoch"] / (counter + 1)):.6f}',
                'loss_pl2pl':
                f'{float(epoch_losses["loss_pl2pl_epoch"] / (counter + 1)):.6f}',
                'visible_pixels':
                f'{float(epoch_losses["visible_pixels_epoch"] / (counter + 1)):.6f}'
            })

            counter += 1

        return epoch_losses
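
The nested per-key device transfer in this example amounts to a small reusable helper: move every entry that supports .to() and leave the rest alone. A compact sketch of that idea (the function name and type hints are illustrative, not part of the original code):

import torch

def dict_to_device(batch: dict, device: torch.device) -> dict:
    # Move values that support .to() (e.g. tensors) to the device,
    # leaving plain Python entries (strings, ints, lists) untouched.
    return {key: (value.to(device) if hasattr(value, 'to') else value)
            for key, value in batch.items()}

# preprocessed_dict = dict_to_device(preprocessed_dict, self.device)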
Example 4
def run( config , validator, subtensor, wallet, metagraph, dataset, device, uid, dendrite):
    
    print(config)
    config.to_defaults()
    validator = validator.to(device)
    optimizer = torch.optim.SGD(
        validator.parameters(),
        lr = config.neuron.learning_rate,
        momentum = config.neuron.momentum,
    )
    if config.wandb.api_key != 'default':
        # Create wandb for telemetry.
        bittensor.wandb(
            config = config,
            cold_pubkey = wallet.coldkeypub.ss58_address,
            hot_pubkey = wallet.hotkey.ss58_address,
            root_dir = config.neuron.full_path
        )

    # Optionally resume.
    if not config.neuron.no_restart:
        try:
            validator.load_state_dict( torch.load("{}/validator.torch".format( config.neuron.full_path ))['validator'], strict=False )
        except Exception as e:
            logger.error('Error reloading model: {} '.format(e))

    # --- last sync block 
    last_sync_block = subtensor.get_current_block()

    # --- Run Forever.
    epoch = 0
    global_step = 0
    best_loss = math.inf
    ema_score_decay = 0.995
    ema_scores = torch.nn.Parameter(torch.zeros_like(validator.peer_weights, device = device) * (1 / metagraph.n.item()), requires_grad = False)

    while True:

        # --- Run epoch.
        start_block = subtensor.get_current_block() + 1
        end_block = start_block + config.neuron.blocks_per_epoch
        blocks = [ block for block in range(start_block, end_block) ]
        progress = qqdm( blocks, total=len(blocks), desc=format_str('white', f'Epoch'))
        progress.set_bar = partial(progress.set_bar,  element='#')

        # --- Reset the epoch logs
        total_epoch_score = torch.zeros(metagraph.n.item(), device = device)
        total_epoch_loss = 0
        batch_count = 0
        
        for block in progress:
            
            # --- Training step.
            current_block = subtensor.get_current_block()
            while block >= current_block:
                loss, _, query_uids = validator( next( dataset ) )
                val_score = validator.scores()
                scores = torch.nn.functional.normalize ( torch.relu( val_score ), p=1, dim = 0 )
                scores[query_uids] += 1e-6
                loss.backward()
                clip_grad_norm_(validator.parameters(), config.neuron.clip_gradients)
                optimizer.step()
                optimizer.zero_grad() 
                global_step += 1
                batch_count += 1
                total_epoch_score += scores.detach()
                total_epoch_loss += loss.item()
                ema_scores = (ema_score_decay * ema_scores) + (1 - ema_score_decay) * scores.detach()
                current_block = subtensor.get_current_block()

            # --- Step logs.
            info = {
                'Step': colored('{}'.format(global_step), 'red'),
                'Epoch': colored('{}'.format(epoch), 'yellow'),
                'Best-loss': colored('{:.4f}'.format(best_loss), 'green'),            
                'Loss': colored('{:.4f}'.format(loss.item()), 'blue'),            
                'nPeers': colored(metagraph.n.item(), 'red'),
                'Stake(\u03C4)': colored('{:.3f}'.format(metagraph.S[uid].item()), 'yellow'),
                'Rank(\u03C4)': colored('{:.3f}'.format(metagraph.R[uid].item()), 'green'),
                'Incentive(\u03C4/block)': colored('{:.6f}'.format(metagraph.I[uid].item()), 'blue'),
                'Dividends': colored('{:.4f}'.format(metagraph.D[ uid ].item()), 'red'),
                'Current Block': colored('{}'.format(block), 'yellow')
            }
            
            topk_scores, topk_idx = bittensor.unbiased_topk(ema_scores, 5, dim=0)
            for idx, ema_score in zip(topk_idx, topk_scores) :
                color =  'green' if scores[idx] - ema_score > 0 else 'red'
                info[f'uid_{idx.item()}'] = colored('{:.4f}'.format(ema_score), color) 
            
            
            progress.set_infos( info )
        
        # --- End of epoch
        # --- Set mechanism weights.
        inactive_uids = torch.where(metagraph.active == 0)[0]
        ema_scores[inactive_uids] = 0
        topk_scores, topk_uids = bittensor.unbiased_topk( ema_scores.detach().to('cpu'), k = min(config.neuron.n_topk_peer_weights, metagraph.n.item()))
        subtensor.set_weights(
            uids = topk_uids,
            weights = topk_scores,
            wait_for_inclusion = False,
            wallet = wallet,
        )

        # --- Log.
        epoch_loss = total_epoch_loss / batch_count
        active_uids = torch.where(metagraph.active > 0)[0]

        nn = subtensor.neuron_for_pubkey(wallet.hotkey.ss58_address)
                
        if config.wandb.api_key != 'default':
            wandb_data = {
                'stake': nn.stake,
                'dividends': nn.dividends,
                'epoch_loss': epoch_loss,
                'STD in scores': torch.std(ema_scores[active_uids]).item(),
            } 
            df = pandas.concat( [
                bittensor.utils.indexed_values_to_dataframe( prefix = 'fisher_ema_score', index = topk_uids, values = ema_scores ),
                dendrite.to_dataframe( metagraph = metagraph )
            ], axis = 1)
            df['uid'] = df.index
            wandb_dendrite = dendrite.to_wandb()
            wandb.log( {**wandb_data, **wandb_dendrite}, step = current_block )
            wandb.log( { 'stats': wandb.Table( dataframe = df ) }, step = current_block )

        # --- Save.
        if best_loss > epoch_loss : 
            best_loss = epoch_loss
            torch.save( { 'validator': validator.state_dict() }, "{}/validator.torch".format( config.neuron.full_path ))

        if current_block - last_sync_block > config.neuron.metagraph_sync:
            metagraph.sync()
            last_sync_block = current_block
            validator.sync_with_chain_state()
            chain_growth = max(0, metagraph.n.item() - torch.numel( ema_scores ))
            ema_scores = torch.nn.Parameter(torch.cat([ema_scores, torch.zeros([chain_growth], dtype=torch.float32, requires_grad=False, device = device)]))

        epoch += 1
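
The end-of-epoch weight setting above boils down to: zero the scores of inactive peers, pick the top-k EMA scores, and submit them as mechanism weights. A torch-only sketch of the selection step, using torch.topk in place of bittensor.unbiased_topk and with the subtensor call left out; all sizes are hypothetical:

import torch

ema_scores = torch.rand(64)          # hypothetical EMA score per peer
active = torch.randint(0, 2, (64,))  # hypothetical activity flags from the metagraph

ema_scores[active == 0] = 0          # drop inactive peers from consideration
k = min(8, ema_scores.numel())       # cap k by the number of peers
topk_scores, topk_uids = torch.topk(ema_scores, k)
# topk_uids and topk_scores would then be passed to subtensor.set_weights(...).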
Example 5
        def run_epoch():
            self.model.train(True)
            losses = []
            rlosses = []
            llosses = []
            dlosses = []

            # Re-create the dataloader each time we train so the dataset
            # is randomly shuffled every time we train for an epoch.
            logger.info("Preparing dataset batch...")
            # Set up the dataloader
            dataloader = self.dataset.dataloader(self.config.miner.epoch_length)
            pbar = qqdm(enumerate(dataloader), total=len(dataloader), desc=format_str('blue', f'Epoch Progress'))
            for it, (batch) in pbar:
                # ---- Forward pass ----
                batch = batch.to(self.model.device)
                output = self.model.remote_forward(self, batch, training=True)

                # ---- Backward pass ----
                loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
                loss.backward()

                # ---- Gradient Step ----
                clip_grad_norm_(self.model.parameters(), self.config.miner.clip_gradients)
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.decay_learning_rate(batch)

                # Add losses up
                losses.append(loss.item())
                llosses.append(output.local_target_loss.item())
                rlosses.append(output.remote_target_loss.item())
                dlosses.append(output.distillation_loss.item())

                # ---- Train row weights ----
                batch_weights = torch.mean(output.router.weights, axis = 0).to(self.model.device) # Average over batch.
                self.row = (1 - 0.03) * self.row + 0.03 * batch_weights # Moving avg update.
                self.row = F.normalize(self.row, p = 1, dim = 0) # Ensure normalization.

                # ---- Logging ----
                index = self.metagraph.state.index_for_uid[self.metagraph.uid]
                pbar.set_infos({
                    'GS': colored('{}'.format(self.global_step), 'red'),
                    'LS': colored('{}'.format(it), 'blue'),
                    'Epoch': colored('{}'.format(self.epoch+1), 'green'),
                    'L-loss': colored('{:.5f}'.format(output.local_target_loss.item()), 'red'),
                    'R-loss': colored('{:.5f}'.format(output.remote_target_loss.item()), 'blue'),
                    'D-loss': colored('{:.5f}'.format(output.distillation_loss.item()), 'green'),
                    'lr': colored('{:e}'.format(self.lr), 'white'),
                    'nPeers': self.metagraph.n,
                    'Stake(\u03C4)': float(self.metagraph.S[index]),
                    'Rank(\u03C4)': float(self.metagraph.R[index]),
                    'Incentive(\u03C4/block)': float(self.metagraph.I[index]),
                    'Axon': self.axon.__str__(),
                    'Dendrite': self.dendrite.__str__(),
                })
                self.tensorboard.add_scalar('Neuron/Rloss', output.remote_target_loss.item(), self.global_step)
                self.tensorboard.add_scalar('Neuron/Lloss', output.local_target_loss.item(), self.global_step)
                self.tensorboard.add_scalar('Neuron/Dloss', output.distillation_loss.item(), self.global_step)
                self.global_step += 1


            avg_loss = sum(losses) / len(losses)
            self.rloss = sum(rlosses) / len(rlosses)
            self.lloss = sum(llosses) / len(llosses)
            self.dloss = sum(dlosses) / len(dlosses)

            self.training_loss = avg_loss
Example 6
    def run( self ):
        r""" Miner main loop.
        """
        # ---- Build Bittensor neuron ----
        with self:
            if self.config.neuron.use_wandb:
                bittensor.wandb(
                    config = self.config,
                    cold_pubkey = self.wallet.coldkeypub.ss58_address,
                    hot_pubkey = self.wallet.hotkey.ss58_address,
                    root_dir = self.config.neuron.full_path
                )

            # ---- Init run state ----
            self.epoch = 0   

            # ---- reloads previous run if not restart ----
            if self.config.neuron.no_restart:
                self.save()

            try:
                self.reload()
                self.axon.check()
            except Exception as e:
                logger.error("Error when trying to reload model: {}".format(e))
                self.save()
                self.reload()
                self.axon.check()
            
            self.stats.ema_scores = torch.nn.Parameter(torch.ones(self.metagraph.n.item()).to(self.device) * (1 / self.metagraph.n.item()), requires_grad = False)

            # --- Run until n_epochs ----
            while self.epoch < self.config.neuron.n_epochs:
                try:
                    # --- Init epoch stat----
                    self.stats.epoch_data_size = 0
                    self.stats.epoch_sync_count = 0
                    total_local_target_epoch_loss = 0
                    total_distillation_epoch_loss = 0
                    total_remote_target_epoch_loss = 0
                    total_local_epoch_acc = 0
                    batches_count = 0

                    # ---- Run epoch ----
                    start_block = self.subtensor.get_current_block() + 1
                    end_block = start_block + self.config.neuron.epoch_length
                    block_steps = [ block_delta for block_delta in range(start_block, end_block)]
                    progress_bar = qqdm( block_steps, total=len(block_steps), desc=format_str('blue', f'Epoch:'))
                    progress_bar.set_bar = partial(progress_bar.set_bar,  element='#')
                    for block in progress_bar:

                        # --- Iterate over batches until the end of the block.
                        current_block = self.subtensor.get_current_block()
                        while block >= current_block:
                            # ---- Forward pass ----
                            inputs = next( self.dataset )
                            output = self.nucleus.remote_forward (
                                inputs = inputs.to( self.device ),
                                training = True,
                            )
                            
                            # ---- Backward pass ----
                            output.loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
                            scores = torch.nn.functional.normalize ( torch.relu( self.nucleus.compute_scores(output.remote_target_loss) ), p=1, dim = 0 )
                            scores[output.query_uids] += 1e-6

                            output.loss.backward() # Accumulates gradients on the nucleus.
                            clip_grad_norm_(self.nucleus.parameters(), self.config.neuron.clip_gradients)
                            
                            # ---- Apply and zero accumulated gradients.
                            self.optimizer.step() 
                            self.optimizer.zero_grad()
                            current_block = self.subtensor.get_current_block()
                            
                            # ---- Aggregate outputs and losses
                            total_local_target_epoch_loss += output.local_target_loss.item()
                            total_distillation_epoch_loss += output.distillation_loss.item()
                            total_remote_target_epoch_loss += output.remote_target_loss.item()
                            total_local_epoch_acc += output.local_accuracy
                            self.stats.epoch_data_size += inputs.nelement()
                            batches_count += 1
                            
                            # ---- Expand ema_scores tensor if the chain grew and aggregate the score
                            chain_growth = max(scores.shape[0] - self.stats.ema_scores.shape[0], 0)
                            if chain_growth > 0:
                                self.stats.ema_scores = torch.nn.Parameter(torch.cat( [self.stats.ema_scores, torch.zeros([chain_growth], dtype=torch.float32, device = self.device)]), requires_grad=False)
                            self.stats.ema_scores = self.fisher_ema_decay * self.stats.ema_scores + (1 - self.fisher_ema_decay) * scores
                            self.stats.scores = scores


                        # ---- Sync with metagraph if the current block >= last synced block + sync block time 
                        current_block = self.subtensor.get_current_block()
                        block_diff = current_block - self.stats.last_sync_block
                        if block_diff >= self.config.neuron.sync_block_time:
                            self.sync(current_block)                                                                                                                
                            self.stats.last_sync_block = current_block
                            self.stats.epoch_sync_count += 1
                            
                        # ---- Update the epoch loss if it is the last iteration within epoch
                        if block+1 == end_block :
                            self.stats.local_target_epoch_loss = total_local_target_epoch_loss / batches_count
                            self.stats.distillation_epoch_loss = total_distillation_epoch_loss / batches_count
                            self.stats.remote_target_epoch_loss = total_remote_target_epoch_loss / batches_count
                            self.stats.local_epoch_acc = total_local_epoch_acc / batches_count

                        # ---- Block logs.
                        self.logs (
                            progress_bar,
                            iteration = block-start_block,
                            output = output,
                        )
                        self.stats.global_step += 1

                    # ---- Update params ----
                    self.epoch += 1

                    # ---- Checkpoint state ----
                    self.checkpoint()

                except KeyboardInterrupt:
                    # --- User ended session ----
                    break

                except Exception as e:
                    # --- Unknown error ----
                    logger.exception('Unknown exception: {} with traceback {}', e, traceback.format_exc())
                    if self.config.neuron.restart_on_failure:
                        logger.info('Restarting from last saved state.')
                        self.reload()
                    else:
                        break
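
This miner loop and the validator loop in Example 4 share the same block-paced inner loop: keep taking training steps until the chain reaches the target block, then log once per block. A stripped-down sketch of that control flow, with a hypothetical train_step() standing in for the forward/backward work:

def run_block_paced_epoch(subtensor, start_block, end_block, train_step):
    # Take training steps until the chain head reaches each target block,
    # then hand control back for that block's logging.
    for block in range(start_block, end_block):
        current_block = subtensor.get_current_block()
        while block >= current_block:
            train_step()                                   # one forward/backward/optimizer step
            current_block = subtensor.get_current_block()  # re-check the chain head
        # per-block logging happens here in the examples above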
Example 7
def train(num_epochs, learning_rate, train_dataloader, model_type):
    # Model
    model_classes = {
        'resnet': Resnet,
        'fcn': fcn_autoencoder,
        'cnn': conv_autoencoder,
        'vae': VAE,
    }
    # Instantiate only the requested model instead of constructing all four.
    model = model_classes[model_type]().cuda()

    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    best_loss = np.inf
    model.train()

    # learning rate schedule
    warm_up_with_cosine_lr = lambda epoch: epoch / config[
        'warm_up_epochs'] if epoch <= config['warm_up_epochs'] else 0.5 * (
            math.cos((epoch - config['warm_up_epochs']) /
                     (num_epochs - config['warm_up_epochs']) * math.pi) + 1)
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer, lr_lambda=warm_up_with_cosine_lr)

    qqdm_train = qqdm(range(num_epochs),
                      desc=format_str('bold', 'Description'))
    for epoch in qqdm_train:
        tot_loss = list()
        for data in train_dataloader:

            # ===================loading=====================
            img = data.float().cuda()
            if model_type == 'fcn':
                # The fully-connected autoencoder expects flattened inputs.
                img = img.view(img.shape[0], -1)

            # ===================forward=====================
            output = model(img)
            if model_type in ['vae']:
                loss = loss_vae(output[0], img, output[1], output[2],
                                criterion)
            else:
                loss = criterion(output, img)

            tot_loss.append(loss.item())

            # ===================backward====================
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # ===================save_best====================
        mean_loss = np.mean(tot_loss)
        if mean_loss < best_loss:
            best_loss = mean_loss
            torch.save(model, 'best_{}.pt'.format(train_name))
        # ===================log========================
        qqdm_train.set_infos({
            'epoch': f'{epoch + 1:.0f}/{num_epochs:.0f}',
            'loss': f'{mean_loss:.4f}',
        })

        # warm up
        scheduler.step()
        realLearningRate = scheduler.get_last_lr()[0]

        # wandb
        wandb.log({
            'epoch': epoch + 1,
            'train_loss': mean_loss,
            'learningRate': realLearningRate
        })
        # ===================save_last========================
        torch.save(model, 'last_{}.pt'.format(train_name))
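
The warm-up-plus-cosine schedule packed into the lambda above reads more clearly as a named function: a linear ramp over the first warm_up_epochs, then a half-cosine decay towards zero over the remaining epochs. An equivalent sketch, assuming the same config['warm_up_epochs'] key:

import math

def warmup_cosine(epoch, num_epochs, warm_up_epochs):
    # Linear ramp from 0 to 1 during warm-up, then half-cosine decay to 0.
    if epoch <= warm_up_epochs:
        return epoch / warm_up_epochs
    progress = (epoch - warm_up_epochs) / (num_epochs - warm_up_epochs)
    return 0.5 * (math.cos(progress * math.pi) + 1)

# scheduler = torch.optim.lr_scheduler.LambdaLR(
#     optimizer, lr_lambda=lambda e: warmup_cosine(e, num_epochs, config['warm_up_epochs']))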
Example 8
import time
import random

from qqdm import qqdm, format_str

start = time.time()

for ep in range(2):
    tw = qqdm(range(100), desc=format_str('blue', 'demo'))
    time.sleep(1)
    for i in tw:
        tw.set_infos({
            'loss': f'{random.random():.4f}',
            'acc': f'{random.random():.4f}',
        })
        time.sleep(.01)
    print('Done')
    time.sleep(2)

time.sleep(.1)
# Wrap the enumerate in a named qqdm so set_infos updates this bar
# (enumerate has no len(), so the total is passed explicitly).
tw = qqdm(enumerate(range(100)), total=100, desc=format_str('blue', 'test enum'))
for i, item in tw:
    tw.set_infos({
        'loss': f'{random.random():.4f}',
        'acc': f'{random.random():.4f}',
    })
    time.sleep(.01)
print('Done')
time.sleep(.4)

# Same pattern again, iterating the wrapped enumerate directly.
tw = qqdm(enumerate(range(100)), total=100, desc=format_str('blue', 'test enum'))
for i, item in tw:
    tw.set_infos({'acc': f'{random.random():.4f}'})
    time.sleep(.01)
print('Done')