Example #1
    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Neuron ----
        self.neuron = bittensor.neuron.Neuron(self.config)

        # ---- Model ----
        self.model = BertNSPSynapse( self.config )

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr = self.config.miner.learning_rate, momentum=self.config.miner.momentum)
        self.scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, 50, 300)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(BertNSPSynapse, torch.optim.SGD)

        # ---- Dataset ----
        # Dataset: News headlines
        self.dataset = load_dataset('ag_news')['train']


        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir = self.config.miner.full_path)
        if self.config.miner.record_log:
            logger.add(self.config.miner.full_path + "/{}_{}.log".format(self.config.miner.name, self.config.miner.trial_uid),format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}")
Example #2
    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Model ----
        self.model = GPT2Synapse(self.config)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(GPT2Synapse, AdamW)

        # ---- Optimizer ----
        self.optimizer = self.configure_optimizers()
        self.lr = self.config.miner.learning_rate
        self.training_loss = math.inf
        self.best_train_loss = math.inf
        self.rloss = math.inf
        self.lloss = math.inf
        self.dloss = math.inf

        # ---- Dataset ----
        # The Genesis Dataset:
        # The dataset used to train Adam and his first 100 children.
        self.dataset = GenesisTextDataloader(
            self.config.miner.batch_size_train, self.model.get_block_size())
        self.tokens = 0
        super(Miner, self).__init__(self.config, **kwargs)
Example #3
    def __init__(self, config: Munch = None):
        if config is None:
            config = Miner.build_config()
            logger.info(bittensor.config.Config.toString(config))
        self.config = config

        # ---- Neuron ----
        self.neuron = bittensor.neuron.Neuron(self.config)

        # ---- Model ----
        self.model = BertMLMSynapse( self.config )

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr = self.config.miner.learning_rate, momentum=self.config.miner.momentum)
        self.scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, 50, 300)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(BertMLMSynapse, torch.optim.SGD)

        # ---- Dataset ----
        # Dataset: AG News headlines.
        self.dataset = load_dataset('ag_news')['train']
        # The collator accepts a list [ {'input_ids': ..., ...} ] where each inner dict
        # is produced by the tokenizer.
        self.data_collator = DataCollatorForLanguageModeling (
            tokenizer=bittensor.__tokenizer__(), mlm=True, mlm_probability=0.15
        )

        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir = self.config.miner.full_path)
        if self.config.miner.record_log:
            logger.add(self.config.miner.full_path + "/{}_{}.log".format(self.config.miner.name, self.config.miner.trial_uid),format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}")
Example #4
    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Neuron ----
        self.neuron = bittensor.neuron.Neuron(self.config)

        # ---- Model ----
        self.model = GPT2LMSynapse(self.config)

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=self.config.miner.learning_rate,
                                         momentum=self.config.miner.momentum)
        self.scheduler = WarmupCosineWithHardRestartsSchedule(
            self.optimizer, 50, 300)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(GPT2LMSynapse, torch.optim.SGD)

        # ---- Dataset ----
        # The Genesis Dataset:
        # The dataset used to train Adam and his first 100 children.
        self.dataset = AdamCorpus(self.config.miner.custom_dataset)

        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir=self.config.miner.full_path)
        if self.config.miner.record_log:
            logger.add(
                self.config.miner.full_path + "/{}_{}.log".format(
                    self.config.miner.name, self.config.miner.trial_uid),
                format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}")
Example #5
    def __init__(self, config: Munch = None):
        if config is None:
            config = Miner.build_config()
            logger.info(bittensor.config.Config.toString(config))
        self.config = config

        # ---- Neuron ----
        self.neuron = bittensor.neuron.Neuron(self.config)

        # ---- Model ----
        self.model = GPT2LMSynapse( self.config )

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr = self.config.miner.learning_rate, momentum=self.config.miner.momentum)
        self.scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, 50, 300)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(GPT2LMSynapse, torch.optim.SGD)

        # ---- Dataset ----
        # Dataset: AG News headlines.
        self.dataset = load_dataset('ag_news')['train']

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir = self.config.miner.full_path)
        if self.config.miner.record_log:
            logger.add(self.config.miner.full_path + "/{}_{}.log".format(self.config.miner.name, self.config.miner.trial_uid),format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}")
Example #6
    def __init__(self, config: Munch = None):
        if config is None:
            config = Miner.build_config()
            logger.info(bittensor.config.Config.toString(config))
        self.config = config

        # ---- Build Neuron ----
        self.neuron = bittensor.neuron.Neuron(config)

        # ---- Build FFNN Model ----
        self.model = FFNNSynapse(self.config)
        self.model.to(
            torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        self.neuron.axon.serve(self.model)

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=self.config.miner.learning_rate,
                                         momentum=self.config.miner.momentum)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(FFNNSynapse, torch.optim.SGD)

        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir=self.config.miner.full_path)
        if self.config.miner.record_log:
            logger.add(
                self.config.miner.full_path + "/{}_{}.log".format(
                    self.config.miner.name, self.config.miner.trial_uid),
                format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}")
Example #7
    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Neuron ----
        self.neuron = bittensor.neuron.Neuron(self.config)

        # ---- Model ----
        self.model = FFNNSynapse(
            config)  # Feedforward neural network with PKMRouter.
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)  # Set model to device

        # ---- Optimizer ----
        self.optimizer = optim.SGD(self.model.parameters(),
                                   lr=self.config.miner.learning_rate,
                                   momentum=self.config.miner.momentum)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                         step_size=10.0,
                                                         gamma=0.1)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(FFNNSynapse, optim.SGD)

        # ---- Dataset ----
        self.train_data = torchvision.datasets.MNIST(
            root=self.config.miner.root_dir + "datasets/",
            train=True,
            download=True,
            transform=transforms.ToTensor())
        self.trainloader = torch.utils.data.DataLoader(
            self.train_data,
            batch_size=self.config.miner.batch_size_train,
            shuffle=True,
            num_workers=2)
        self.test_data = torchvision.datasets.MNIST(
            root=self.config.miner.root_dir + "datasets/",
            train=False,
            download=True,
            transform=transforms.ToTensor())
        self.testloader = torch.utils.data.DataLoader(
            self.test_data,
            batch_size=self.config.miner.batch_size_test,
            shuffle=False,
            num_workers=2)

        # ---- Tensorboard ----
        self.global_step = 0
        self.tensorboard = SummaryWriter(log_dir=self.config.miner.full_path)
        if self.config.miner.record_log:
            logger.add(
                self.config.miner.full_path + "/{}_{}.log".format(
                    self.config.miner.name, self.config.miner.trial_uid),
                format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}")
Example #8
    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Neuron ----
        self.neuron = bittensor.neuron.Neuron(self.config)

        # ---- Model ----
        self.model = XLMSynapse(self.config)

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=self.config.miner.learning_rate,
                                         momentum=self.config.miner.momentum)
        self.scheduler = WarmupCosineWithHardRestartsSchedule(
            self.optimizer, 50, 300)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(XLMSynapse, torch.optim.SGD)

        # ---- Dataset ----
        # Dataset: Amazon Reviews (multilingual), English split.
        self.dataset = load_dataset('amazon_reviews_multi', 'en')['train']

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        if self.config.synapse.device:
            self.device = torch.device(self.config.synapse.device)

        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir=self.config.miner.full_path)
        if self.config.miner.record_log:
            filepath = f"{self.config.miner.full_path}/{self.config.miner.name}_{self.config.miner.trial_uid}.log"
            logger.add(
                filepath,
                format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}",
                rotation="250 MB",
                retention="10 days")
Example #9
    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Neuron ----
        self.neuron = bittensor.neuron.Neuron(self.config)

        # ---- Model ----
        self.model = GPT2Synapse( self.config )

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(GPT2Synapse, AdamW)

        # ---- Optimizer ----
        self.optimizer = self.configure_optimizers()
        self.lr = self.config.miner.learning_rate
        self.training_loss = math.inf
        self.best_train_loss = math.inf

        # ---- Dataset ----
        # The Genesis Dataset:
        # The dataset used to train Adam and his first 100 children.
        # Here block size = sequence length.
        self.dataset = AdamCorpus(self.model.get_block_size())
        self.tokens = 0

        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir = self.config.miner.full_path)
        if self.config.miner.record_log:
            filepath = self.config.miner.full_path + "/{}_{}.log".format(self.config.miner.name, self.config.miner.trial_uid)
            logger.add(
                filepath,
                format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}",
                rotation="250 MB",
                retention="10 days"
            )
Example #10
class Miner():
    """
    Initializes, trains, and tests models created inside of 'bittensor/synapses'. 
    During instantiation, this class takes a config as a [Munch](https://github.com/Infinidat/munch) object. 
    """
    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Neuron ----
        self.neuron = bittensor.neuron.Neuron(self.config)

        # ---- Model ----
        self.model = XLMSynapse(self.config)

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=self.config.miner.learning_rate,
                                         momentum=self.config.miner.momentum)
        self.scheduler = WarmupCosineWithHardRestartsSchedule(
            self.optimizer, 50, 300)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(XLMSynapse, torch.optim.SGD)

        # ---- Dataset ----
        # Dataset: Amazon Reviews (multilingual), English split.
        self.dataset = load_dataset('amazon_reviews_multi', 'en')['train']

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        if self.config.synapse.device:
            self.device = torch.device(self.config.synapse.device)

        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir=self.config.miner.full_path)
        if self.config.miner.record_log:
            filepath = f"{self.config.miner.full_path}/{self.config.miner.name}_{self.config.miner.trial_uid}.log"
            logger.add(
                filepath,
                format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}",
                rotation="250 MB",
                retention="10 days")

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        Miner.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument('--miner.learning_rate',
                            default=0.01,
                            type=float,
                            help='Training initial learning rate.')
        parser.add_argument('--miner.momentum',
                            default=0.98,
                            type=float,
                            help='Training initial momentum for SGD.')
        parser.add_argument('--miner.n_epochs',
                            default=int(sys.maxsize),
                            type=int,
                            help='Number of training epochs.')
        parser.add_argument('--miner.epoch_length',
                            default=500,
                            type=int,
                            help='Iterations of training per epoch')
        parser.add_argument('--miner.batch_size_train',
                            default=1,
                            type=int,
                            help='Training batch size.')
        parser.add_argument(
            '--miner.sync_interval',
            default=100,
            type=int,
            help='Batches before we sync with chain and emit new weights.')
        parser.add_argument('--miner.log_interval',
                            default=10,
                            type=int,
                            help='Batches before we log miner info.')
        parser.add_argument(
            '--miner.accumulation_interval',
            default=1,
            type=int,
            help='Batches before we apply accumulated gradients.')
        parser.add_argument(
            '--miner.apply_remote_gradients',
            default=False,
            type=bool,
            help=
            'If true, neuron applies gradients which accumulate from remote calls.'
        )
        parser.add_argument(
            '--miner.root_dir',
            default='~/.bittensor/miners/',
            type=str,
            help='Root path to load and save data associated with each miner')
        parser.add_argument(
            '--miner.name',
            default='xlm_wiki',
            type=str,
            help='Trials for this miner go in miner.root / miner.name')
        parser.add_argument(
            '--miner.trial_uid',
            default=str(time.time()).split('.')[0],
            type=str,
            help='Saved models go in miner.root_dir / miner.name / miner.uid')
        parser.add_argument('--miner.record_log',
                            default=False,
                            help='Record all logs when running this miner')
        parser.add_argument(
            '--miner.config_file',
            type=str,
            help=
            'config file to run this neuron, if not using cmd line arguments.')
        parser.add_argument('--debug',
                            dest='debug',
                            action='store_true',
                            help='''Turn on bittensor debugging information''')
        parser.set_defaults(debug=False)
        XLMSynapse.add_args(parser)
        bittensor.neuron.Neuron.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        if config.debug:
            bittensor.__log_level__ = 'TRACE'
            logger.debug('DEBUG is ON')
        else:
            logger.info('DEBUG is OFF')
        assert config.miner.momentum > 0 and config.miner.momentum < 1, "momentum must be a value between 0 and 1"
        assert config.miner.batch_size_train > 0, "batch_size_train must be a positive value"
        assert config.miner.learning_rate > 0, "learning_rate must be a positive value."
        full_path = '{}/{}/{}'.format(config.miner.root_dir, config.miner.name,
                                      config.miner.trial_uid)
        config.miner.full_path = os.path.expanduser(full_path)
        if not os.path.exists(config.miner.full_path):
            os.makedirs(config.miner.full_path)

    # --- Main loop ----
    def run(self):

        # ---- Subscribe ----
        with self.neuron:

            # ---- Weights ----
            self.row = self.neuron.metagraph.row.to(self.model.device)

            # --- Run state ---
            self.global_step = 0
            self.best_train_loss = math.inf

            # --- Loop for epochs ---
            for self.epoch in range(self.config.miner.n_epochs):
                try:
                    # ---- Serve ----
                    self.neuron.axon.serve(self.model)

                    # ---- Train Model ----
                    self.train()
                    self.scheduler.step()

                    # If model has borked for some reason, we need to make sure it doesn't emit weights
                    # Instead, reload into previous version of model
                    if torch.any(
                            torch.isnan(
                                torch.cat([
                                    param.view(-1)
                                    for param in self.model.parameters()
                                ]))):
                        self.model, self.optimizer = self.model_toolbox.load_model(
                            self.config)
                        continue

                    # ---- Emitting weights ----
                    self.neuron.metagraph.set_weights(
                        self.row, wait_for_inclusion=True
                    )  # Sets my row-weights on the chain.

                    # ---- Sync metagraph ----
                    self.neuron.metagraph.sync(
                    )  # Pulls the latest metagraph state (with my update.)
                    self.row = self.neuron.metagraph.row.to(self.model.device)

                    # --- Epoch logs ----
                    print(self.neuron.axon.__full_str__())
                    print(self.neuron.dendrite.__full_str__())
                    print(self.neuron.metagraph)

                    # ---- Update Tensorboard ----
                    self.neuron.dendrite.__to_tensorboard__(
                        self.tensorboard, self.global_step)
                    self.neuron.metagraph.__to_tensorboard__(
                        self.tensorboard, self.global_step)
                    self.neuron.axon.__to_tensorboard__(
                        self.tensorboard, self.global_step)

                    # ---- Save best loss and model ----
                    if self.training_loss and self.epoch % 10 == 0 and self.training_loss < self.best_train_loss:
                        self.best_train_loss = self.training_loss / 10  # update best train loss
                        self.model_toolbox.save_model(
                            self.config.miner.full_path, {
                                'epoch':
                                self.epoch,
                                'model_state_dict':
                                self.model.state_dict(),
                                'loss':
                                self.best_train_loss,
                                'optimizer_state_dict':
                                self.optimizer.state_dict(),
                            })
                        self.tensorboard.add_scalar('Neuron/Train_loss',
                                                    self.training_loss,
                                                    self.global_step)

                # --- Catch Errors ----
                except Exception as e:
                    logger.error(
                        'Exception in training script with error: {}, {}', e,
                        traceback.format_exc())
                    logger.info('Continuing to train.')

    # ---- Train Epoch ----
    def train(self):
        self.training_loss = 0.0
        for local_step in range(self.config.miner.epoch_length):
            # ---- Forward pass ----
            inputs = nextbatch(self.dataset,
                               self.config.miner.batch_size_train,
                               bittensor.__tokenizer__())
            output = self.model.remote_forward(
                self.neuron,
                inputs.to(self.model.device),
                training=True,
            )

            # ---- Backward pass ----
            loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
            loss.backward()  # Accumulates gradients on the model.
            self.optimizer.step()  # Applies accumulated gradients.
            self.optimizer.zero_grad(
            )  # Zeros out gradients for next accumulation

            # ---- Train row weights ----
            batch_weights = torch.mean(output.router.weights, axis=0).to(
                self.model.device)  # Average over batch.
            self.row = (
                1 -
                0.03) * self.row + 0.03 * batch_weights  # Moving avg update.
            self.row = F.normalize(self.row, p=1,
                                   dim=0)  # Ensure normalization.

            # ---- Step logs ----
            logger.info(
                'GS: {} LS: {} Epoch: {}\tLocal Target Loss: {}\tRemote Target Loss: {}\tDistillation Loss: {}\tAxon: {}\tDendrite: {}',
                colored('{}'.format(self.global_step), 'red'),
                colored('{}'.format(local_step), 'blue'),
                colored('{}'.format(self.epoch), 'green'),
                colored('{:.4f}'.format(output.local_target_loss.item()),
                        'green'),
                colored('{:.4f}'.format(output.remote_target_loss.item()),
                        'blue'),
                colored('{:.4f}'.format(output.distillation_loss.item()),
                        'red'), self.neuron.axon, self.neuron.dendrite)
            logger.info('Codes: {}', output.router.return_codes.tolist())

            self.tensorboard.add_scalar('Neuron/Rloss',
                                        output.remote_target_loss.item(),
                                        self.global_step)
            self.tensorboard.add_scalar('Neuron/Lloss',
                                        output.local_target_loss.item(),
                                        self.global_step)
            self.tensorboard.add_scalar('Neuron/Dloss',
                                        output.distillation_loss.item(),
                                        self.global_step)

            # ---- Step increments ----
            self.global_step += 1
            self.training_loss += output.local_target_loss.item()

            # --- Memory clean up ----
            torch.cuda.empty_cache()
            del output
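The class docstring in Example #10 notes that the Miner takes a Munch config and merges any keyword arguments into config.miner before training. A minimal usage sketch under that assumption; the __main__ guard and the specific keyword overrides are illustrative, not part of the example above.

if __name__ == "__main__":
    # Keyword arguments are folded into config.miner via Config.update_with_kwargs,
    # so defaults such as the learning rate or the logging flag can be overridden here.
    miner = Miner(learning_rate=0.05, record_log=True)
    miner.run()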
Example #11
class Miner():

    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Neuron ----
        self.neuron = bittensor.neuron.Neuron(self.config)

        # ---- Model ----
        self.model = GPT2Synapse( self.config )

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(GPT2Synapse, AdamW)

        # ---- Optimizer ----
        self.optimizer = self.configure_optimizers()
        self.lr = self.config.miner.learning_rate
        self.training_loss = math.inf
        self.best_train_loss = math.inf

        # ---- Dataset ----
        # The Genesis Dataset:
        # The dataset used to train Adam and his first 100 children.
        # Here block size = sequence length.
        self.dataset = AdamCorpus(self.model.get_block_size())
        self.tokens = 0

        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir = self.config.miner.full_path)
        if self.config.miner.record_log:
            filepath = self.config.miner.full_path + "/{}_{}.log".format(self.config.miner.name, self.config.miner.trial_uid)
            logger.add(
                filepath,
                format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}",
                rotation="250 MB",
                retention="10 days"
            )
               
    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        Miner.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument('--miner.learning_rate', default=3e-2, type=float, help='Training initial learning rate.')
        parser.add_argument('--miner.weight_decay', default=0.25, help='Model parameter weight decay.')
        parser.add_argument('--miner.lr_decay', default=True, help='Learning rate decay: linear warmup followed by cosine decay to 10 percent of the original rate.')
        parser.add_argument('--miner.warmup_tokens', default=375e6, help='Linear LR warmup over the first miner.warmup_tokens tokens (default is 375 million).')
        parser.add_argument('--miner.final_tokens', default=260e9, help='Token count at which the learning rate reaches 10 percent of its original value.')
        parser.add_argument('--miner.num_workers', default=1, help='Number of workers for data loader.')
        parser.add_argument('--miner.clip_gradients', default=1.0, type=float, help='Implement gradient clipping to avoid exploding loss on smaller architectures.')
        parser.add_argument('--miner.n_epochs', default=int(sys.maxsize), type=int, help='Number of training epochs.')
        parser.add_argument('--miner.epoch_length', default=500, type=int, help='Iterations of training per epoch')
        parser.add_argument('--miner.batch_size_train', default=2, type=int, help='Training batch size.')
        parser.add_argument('--miner.root_dir', default='~/.bittensor/miners/', type=str,  help='Root path to load and save data associated with each miner')
        parser.add_argument('--miner.name', default='gpt2-genesis', type=str, help='Trials for this miner go in miner.root / miner.name')
        parser.add_argument('--miner.trial_uid', default=str(time.time()).split('.')[0], type=str, help='Saved models go in miner.root_dir / miner.name / miner.uid')
        parser.add_argument('--miner.record_log', default=False, help='Record all logs when running this miner')
        parser.add_argument('--miner.custom_dataset', default="~/.bittensor/bittensor/miners/TEXT/gpt2_genesis/genesis_dataset/", type=str, help='Custom datasets to train on.')
        parser.add_argument('--miner.config_file', type=str, help='config file to run this neuron, if not using cmd line arguments.')
        GPT2Synapse.add_args(parser)
        bittensor.neuron.Neuron.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        assert config.miner.batch_size_train > 0, "batch_size_train must be a positive value"
        assert config.miner.learning_rate > 0, "learning_rate must be a positive value."
        config.miner.custom_dataset = os.path.expanduser(config.miner.custom_dataset)
        full_path = '{}/{}/{}'.format(config.miner.root_dir, config.miner.name, config.miner.trial_uid)
        config.miner.full_path = os.path.expanduser(full_path)
        if not os.path.exists(config.miner.full_path):
            os.makedirs(config.miner.full_path)


    def configure_optimizers(self):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.

        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding, torch.nn.Tanh)
        for mn, m in self.model.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name

                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # special case the position embedding parameter in the root GPT module as not decayed
        no_decay.add('pos_emb')

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.model.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": self.config.miner.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=self.config.miner.learning_rate, betas=(0.9, 0.95))
        return optimizer

    # --- Main loop ----
    def run (self):

        # ---- Subscribe ----
        with self.neuron:

            # ---- Weights ----
            self.row = self.neuron.metagraph.row.to(self.model.device)

            # --- Run state ---
            self.global_step = 0

            # --- Loop for epochs ---
            for self.epoch in range(self.config.miner.n_epochs):

                # ---- Serve ----
                self.neuron.axon.serve( self.model )

                # ---- Train Model ----
                self.train()

                # If model has borked for some reason, we need to make sure it doesn't emit weights
                # Instead, reload into previous version of model
                if torch.any(torch.isnan(torch.cat([param.view(-1) for param in self.model.parameters()]))):
                    self.model, self.optimizer = self.model_toolbox.load_model(self.config)
                    continue

                # ---- Emitting weights ----
                self.neuron.metagraph.set_weights(self.row, wait_for_inclusion = True) # Sets my row-weights on the chain.

                # ---- Sync metagraph ----
                self.neuron.metagraph.sync() # Pulls the latest metagraph state (with my update.)
                self.row = self.neuron.metagraph.row.to(self.model.device)

                # ---- Update Tensorboard ----
                self.neuron.dendrite.__to_tensorboard__(self.tensorboard, self.global_step)
                self.neuron.metagraph.__to_tensorboard__(self.tensorboard, self.global_step)
                self.neuron.axon.__to_tensorboard__(self.tensorboard, self.global_step)

                # ---- Save best loss and model ----
                if self.training_loss < self.best_train_loss: #self.epoch % 10 == 0:
                        self.best_train_loss = self.training_loss  # update best train loss
                        self.model_toolbox.save_model(
                            self.config.miner.full_path,
                            {
                                'epoch': self.epoch,
                                'model_state_dict': self.model.state_dict(),
                                'loss': self.best_train_loss,
                                'optimizer_state_dict': self.optimizer.state_dict(),
                            }
                        )
                        self.tensorboard.add_scalar('Neuron/Train_loss', self.training_loss, self.global_step)
                logger.info("This epoch's training loss: {}...Current best training loss: {}".format(self.training_loss, self.best_train_loss))


    def decay_learning_rate(self, batch):
        """Decay the learning rate based on the progress thus far.
        Adjusts the self.config.miner.learning_rate according to the
        tokens processed so far, returns number of tokens.

        Args:
            tokens (int): Number of tokens processed so far.
        """

        if self.config.miner.lr_decay:
            # number of tokens processed this step
            self.tokens += (batch >= 0).sum()
            if self.tokens < self.config.miner.warmup_tokens:
                # linear warmup
                lr_mult = float(self.tokens) / float(max(1, self.config.miner.warmup_tokens))
            else:
                # cosine learning rate decay
                progress = float(self.tokens - self.config.miner.warmup_tokens) / float(max(1, self.config.miner.final_tokens - self.config.miner.warmup_tokens))
                lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))

            self.lr = self.config.miner.learning_rate * lr_mult

            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.lr
        else:
            self.lr = self.config.miner.learning_rate


    def shuffle_dataset_epoch_length(self):
        """Shuffles the miner's dataset so we get a shuffled, randomized dataset
        of length miner.epoch_length

        Returns:
            [list] : shuffled dataset of length miner.epoch_length
        """

        shuffled_dataset = []
        loader = DataLoader(self.dataset, shuffle=True,
                        batch_size=self.config.miner.batch_size_train,
                        num_workers=self.config.miner.num_workers)


        for it, batch in enumerate(loader):
            shuffled_dataset.append(batch)
            if it == self.config.miner.epoch_length:
                break

        return shuffled_dataset

    def get_lr(self):
        for param_group in self.optimizer.param_groups:
            return param_group['lr']

    # ---- Train Epoch ----
    def train(self):

        def run_epoch():
            self.model.train(True)
            losses = []

            # Re-create dataloader every time we call train
            # This way, since epoch_length < len(dataset), we can
            # make sure that the dataset is randomly shuffled each time
            # we train for an epoch.
            logger.info("Preparing dataset batch...")
            dataset = self.shuffle_dataset_epoch_length()
            pbar = qqdm(enumerate(dataset), total=len(dataset), desc=format_str('blue', f'Epoch Progress'))

            for it, (batch) in pbar:
                batch = batch.to(self.model.device)
                output = self.model.remote_forward(self.neuron, batch, training=True)
                loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
                loss.backward()

                clip_grad_norm_(self.model.parameters(), self.config.miner.clip_gradients)
                self.optimizer.step()
                self.optimizer.zero_grad()

                self.decay_learning_rate(batch)

                losses.append(loss.item())

                 # ---- Train row weights ----
                batch_weights = torch.mean(output.router.weights, axis = 0).to(self.model.device) # Average over batch.
                self.row = (1 - 0.03) * self.row + 0.03 * batch_weights # Moving avg update.
                self.row = F.normalize(self.row, p = 1, dim = 0) # Ensure normalization.

                pbar.set_infos({
                    'GS': colored('{}'.format(self.global_step), 'red'),
                    'LS': colored('{}'.format(it), 'blue'),
                    'Epoch': colored('{}'.format(self.epoch+1), 'green'),
                    'Local loss': colored('{:.5f}'.format(output.local_target_loss.item()), 'red'),
                    'Remote loss': colored('{:.5f}'.format(output.remote_target_loss.item()), 'blue'),
                    'Distillation loss': colored('{:.5f}'.format(output.distillation_loss.item()), 'green'),
                    'Learning Rate:': colored('{:e}'.format(self.lr), 'white'),
                    'Axon': self.neuron.axon.__str__(),
                    'Dendrite': self.neuron.dendrite.__str__(),
                })

                self.tensorboard.add_scalar('Neuron/Rloss', output.remote_target_loss.item(), self.global_step)
                self.tensorboard.add_scalar('Neuron/Lloss', output.local_target_loss.item(), self.global_step)
                self.tensorboard.add_scalar('Neuron/Dloss', output.distillation_loss.item(), self.global_step)
                self.global_step += 1


            avg_loss = sum(losses) / len(losses)
            self.training_loss = avg_loss

        run_epoch()
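The decay_learning_rate method above implements a linear warmup over miner.warmup_tokens followed by a cosine decay floored at 10 percent of the base rate. A standalone sketch of that multiplier, handy for sanity-checking the schedule against concrete token counts (the token counts in the print calls are illustrative):

import math

def lr_multiplier(tokens: float, warmup_tokens: float, final_tokens: float) -> float:
    # Linear warmup: ramp from 0 to 1 over the first warmup_tokens tokens.
    if tokens < warmup_tokens:
        return float(tokens) / float(max(1, warmup_tokens))
    # Cosine decay from 1.0 down to a floor of 0.1 over the remaining tokens.
    progress = float(tokens - warmup_tokens) / float(max(1, final_tokens - warmup_tokens))
    return max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))

print(lr_multiplier(187.5e6, 375e6, 260e9))   # 0.5, halfway through warmup
print(lr_multiplier(260e9, 375e6, 260e9))     # 0.1, the decay floor at final_tokens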
Example #12
class Miner():
    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Neuron ----
        self.neuron = bittensor.neuron.Neuron(self.config)

        # ---- Model ----
        self.model = FFNNSynapse(
            config)  # Feedforward neural network with PKMRouter.
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)  # Set model to device

        # ---- Optimizer ----
        self.optimizer = optim.SGD(self.model.parameters(),
                                   lr=self.config.miner.learning_rate,
                                   momentum=self.config.miner.momentum)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                         step_size=10.0,
                                                         gamma=0.1)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(FFNNSynapse, optim.SGD)

        # ---- Dataset ----
        self.train_data = torchvision.datasets.MNIST(
            root=self.config.miner.root_dir + "datasets/",
            train=True,
            download=True,
            transform=transforms.ToTensor())
        self.trainloader = torch.utils.data.DataLoader(
            self.train_data,
            batch_size=self.config.miner.batch_size_train,
            shuffle=True,
            num_workers=2)
        self.test_data = torchvision.datasets.MNIST(
            root=self.config.miner.root_dir + "datasets/",
            train=False,
            download=True,
            transform=transforms.ToTensor())
        self.testloader = torch.utils.data.DataLoader(
            self.test_data,
            batch_size=self.config.miner.batch_size_test,
            shuffle=False,
            num_workers=2)

        # ---- Tensorboard ----
        self.global_step = 0
        self.tensorboard = SummaryWriter(log_dir=self.config.miner.full_path)
        if self.config.miner.record_log:
            logger.add(
                self.config.miner.full_path + "/{}_{}.log".format(
                    self.config.miner.name, self.config.miner.trial_uid),
                format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}")

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        Miner.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument('--miner.learning_rate',
                            default=0.01,
                            type=float,
                            help='Training initial learning rate.')
        parser.add_argument('--miner.momentum',
                            default=0.9,
                            type=float,
                            help='Training initial momentum for SGD.')
        parser.add_argument('--miner.n_epochs',
                            default=int(sys.maxsize),
                            type=int,
                            help='Number of training epochs.')
        parser.add_argument(
            '--miner.epoch_length',
            default=int(sys.maxsize),
            type=int,
            help='Iterations of training per epoch (or dataset EOF)')
        parser.add_argument('--miner.batch_size_train',
                            default=64,
                            type=int,
                            help='Training batch size.')
        parser.add_argument('--miner.batch_size_test',
                            default=64,
                            type=int,
                            help='Testing batch size.')
        parser.add_argument('--miner.log_interval',
                            default=150,
                            type=int,
                            help='Batches until miner prints log statements.')
        parser.add_argument(
            '--miner.sync_interval',
            default=10,
            type=int,
            help='Batches before we sync with chain and emit new weights.')
        parser.add_argument(
            '--miner.root_dir',
            default='~/.bittensor/miners/',
            type=str,
            help='Root path to load and save data associated with each miner')
        parser.add_argument(
            '--miner.name',
            default='mnist',
            type=str,
            help='Trials for this miner go in miner.root / miner.name')
        parser.add_argument(
            '--miner.trial_uid',
            default=str(time.time()).split('.')[0],
            type=str,
            help='Saved models go in miner.root_dir / miner.name / miner.uid')
        parser.add_argument('--miner.record_log',
                            default=False,
                            help='Record all logs when running this miner')
        parser.add_argument(
            '--miner.config_file',
            type=str,
            help=
            'config file to run this neuron, if not using cmd line arguments.')
        bittensor.neuron.Neuron.add_args(parser)
        FFNNSynapse.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        assert config.miner.log_interval > 0, "log_interval dimension must be positive"
        assert config.miner.momentum > 0 and config.miner.momentum < 1, "momentum must be a value between 0 and 1"
        assert config.miner.batch_size_train > 0, "batch_size_train must be a positive value"
        assert config.miner.batch_size_test > 0, "batch_size_test must be a positive value"
        assert config.miner.learning_rate > 0, "learning_rate must be a positive value."
        full_path = '{}/{}/{}/'.format(config.miner.root_dir,
                                       config.miner.name,
                                       config.miner.trial_uid)
        config.miner.full_path = os.path.expanduser(full_path)
        if not os.path.exists(config.miner.full_path):
            os.makedirs(config.miner.full_path)

    # --- Main loop ----
    def run(self):

        # ---- Subscribe neuron ----
        with self.neuron:

            # ---- Weights ----
            self.row = self.neuron.metagraph.row.to(self.model.device)

            # --- Loop for epochs ---
            self.best_test_loss = math.inf
            self.global_step = 0
            for self.epoch in range(self.config.miner.n_epochs):
                # ---- Serve ----
                self.neuron.axon.serve(self.model)

                # ---- Train ----
                self.train()
                self.scheduler.step()

                # If model has borked for some reason, we need to make sure it doesn't emit weights
                # Instead, reload into previous version of model
                if torch.any(
                        torch.isnan(
                            torch.cat([
                                param.view(-1)
                                for param in self.model.parameters()
                            ]))):
                    self.model, self.optimizer = self.model_toolbox.load_model(
                        self.config)
                    continue

                # ---- Test ----
                test_loss, test_accuracy = self.test()

                # ---- Emit ----
                self.neuron.metagraph.set_weights(
                    self.row, wait_for_inclusion=True
                )  # Sets my row-weights on the chain.

                # ---- Sync ----
                self.neuron.metagraph.sync(
                )  # Pulls the latest metagraph state (with my update.)
                self.row = self.neuron.metagraph.row.to(self.device)

                # --- Display Epoch ----
                print(self.neuron.axon.__full_str__())
                print(self.neuron.dendrite.__full_str__())
                print(self.neuron.metagraph)

                # ---- Update Tensorboard ----
                self.neuron.dendrite.__to_tensorboard__(
                    self.tensorboard, self.global_step)
                self.neuron.metagraph.__to_tensorboard__(
                    self.tensorboard, self.global_step)
                self.neuron.axon.__to_tensorboard__(self.tensorboard,
                                                    self.global_step)

                # ---- Save ----
                if test_loss < self.best_test_loss:
                    self.best_test_loss = test_loss  # Update best loss.
                    self.model_toolbox.save_model(
                        self.config.miner.full_path, {
                            'epoch': self.epoch,
                            'model_state_dict': self.model.state_dict(),
                            'loss': self.best_test_loss,
                            'optimizer_state_dict':
                            self.optimizer.state_dict(),
                        })
                    self.tensorboard.add_scalar('Test loss', test_loss,
                                                self.global_step)

    # ---- Train epoch ----
    def train(self):
        # ---- Init training state ----
        self.model.train()  # Turn on dropout etc.
        for batch_idx, (images, targets) in enumerate(self.trainloader):
            if batch_idx >= self.config.miner.epoch_length:
                break
            self.global_step += 1

            # ---- Remote Forward pass ----
            output = self.model.remote_forward(
                neuron=self.neuron,
                images=images.to(self.device),
                targets=torch.LongTensor(targets).to(self.device),
            )

            # ---- Remote Backward pass ----
            loss = output.remote_target_loss + output.local_target_loss + output.distillation_loss
            loss.backward()  # Accumulates gradients on the model.
            self.optimizer.step()  # Applies accumulated gradients.
            self.optimizer.zero_grad(
            )  # Zeros out gradients for next accumulation

            # ---- Train weights ----
            batch_weights = torch.mean(output.router.weights, axis=0).to(
                self.model.device)  # Average over batch.
            self.row = (
                1 -
                0.03) * self.row + 0.03 * batch_weights  # Moving avg update.
            self.row = F.normalize(self.row, p=1,
                                   dim=0)  # Ensure normalization.

            # ---- Step Logs + Tensorboard ----
            processed = ((batch_idx + 1) * self.config.miner.batch_size_train)
            progress = (100. * processed) / len(self.train_data)
            logger.info(
                'GS: {}\t Epoch: {} [{}/{} ({})]\tLoss: {}\tAcc: {}\tAxon: {}\tDendrite: {}',
                colored('{}'.format(self.global_step), 'blue'),
                colored('{}'.format(self.epoch), 'blue'),
                colored('{}'.format(processed), 'green'),
                colored('{}'.format(len(self.train_data)), 'red'),
                colored('{:.2f}%'.format(progress), 'green'),
                colored('{:.4f}'.format(output.local_target_loss.item()),
                        'green'),
                colored('{:.4f}'.format(output.local_accuracy.item()),
                        'green'), self.neuron.axon, self.neuron.dendrite)
            self.tensorboard.add_scalar('Rloss',
                                        output.remote_target_loss.item(),
                                        self.global_step)
            self.tensorboard.add_scalar('Lloss',
                                        output.local_target_loss.item(),
                                        self.global_step)
            self.tensorboard.add_scalar('Dloss',
                                        output.distillation_loss.item(),
                                        self.global_step)

    # --- Test epoch ----
    def test(self):
        with torch.no_grad(
        ):  # Turns off gradient computation for inference speed up.
            self.model.eval()  # Turns off Dropoutlayers, BatchNorm etc.
            loss = 0.0
            accuracy = 0.0
            for _, (images, labels) in enumerate(self.testloader):

                # ---- Local Forward pass ----
                outputs = self.model.local_forward(
                    images=images.to(self.device),
                    targets=torch.LongTensor(labels).to(self.device),
                )
                loss += outputs.local_target_loss.item()
                accuracy += outputs.local_accuracy.item()

            return loss / len(self.testloader), accuracy / len(self.testloader)
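The train loops in the examples above all maintain the row weights with the same update: an exponential moving average toward the batch's mean router weights, followed by L1 normalization so the row keeps summing to one. A self-contained sketch of just that step (the tensor values are illustrative):

import torch
import torch.nn.functional as F

row = torch.tensor([0.5, 0.3, 0.2])             # current row weights
batch_weights = torch.tensor([0.1, 0.1, 0.8])   # mean router weights for this batch

row = (1 - 0.03) * row + 0.03 * batch_weights   # moving average update
row = F.normalize(row, p=1, dim=0)              # re-normalize so the weights sum to 1
print(row)                                      # tensor([0.4880, 0.2940, 0.2180])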
Example #13
class Miner( bittensor.miner.Miner ):

    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Model ----
        self.model = BertMLMSynapse( self.config )

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr = self.config.miner.learning_rate, momentum=self.config.miner.momentum)
        self.scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, 50, 300)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(BertMLMSynapse, torch.optim.SGD)

        # ---- Dataset ----
        # Dataset: AG News headlines.
        self.dataset = load_dataset('ag_news')['train']
        # The collator accepts a list [ {'input_ids': ..., ...} ] where each inner dict
        # is produced by the tokenizer.
        self.data_collator = DataCollatorForLanguageModeling (
            tokenizer=bittensor.__tokenizer__(), mlm=True, mlm_probability=0.15
        )
        super( Miner, self ).__init__( self.config, **kwargs )

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        Miner.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def check_config(config: Munch):
        assert config.miner.momentum > 0 and config.miner.momentum < 1, "momentum must be a value between 0 and 1"
        assert config.miner.batch_size_train > 0, "batch_size_train must be a positive value"
        assert config.miner.learning_rate > 0, "learning_rate must be a positive value."
        BertMLMSynapse.check_config( config )
        bittensor.miner.Miner.check_config( config )

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument('--miner.learning_rate', default=0.01, type=float, help='Training initial learning rate.')
        parser.add_argument('--miner.momentum', default=0.98, type=float, help='Training initial momentum for SGD.')
        parser.add_argument('--miner.clip_gradients', default=0.8, type=float, help='Implement gradient clipping to avoid exploding loss on smaller architectures.')
        parser.add_argument('--miner.n_epochs', default=int(sys.maxsize), type=int, help='Number of training epochs.')
        parser.add_argument('--miner.epoch_length', default=500, type=int, help='Iterations of training per epoch')
        parser.add_argument('--miner.batch_size_train', default=1, type=int, help='Training batch size.')
        parser.add_argument('--miner.name', default='bert_mlm', type=str, help='Trials for this miner go in miner.root / (wallet_cold - wallet_hot) / miner.name ')
        BertMLMSynapse.add_args(parser)
        bittensor.miner.Miner.add_args(parser)

    # --- Main loop ----
    def run (self):

        # ---- Subscribe ----
        with self:

            # ---- Weights ----
            self.row = self.metagraph.row

            # --- Run state ---
            self.global_step = 0
            self.best_train_loss = math.inf

            # --- Loop for epochs ---
            for self.epoch in range(self.config.miner.n_epochs):
                try:
                    # ---- Serve ----
                    self.axon.serve( self.model )

                    # ---- Train Model ----
                    self.train()
                    self.scheduler.step()

                    # If model has borked for some reason, we need to make sure it doesn't emit weights
                    # Instead, reload into previous version of model
                    if torch.any(torch.isnan(torch.cat([param.view(-1) for param in self.model.parameters()]))):
                        self.model, self.optimizer = self.model_toolbox.load_model(self.config)    
                        continue

                    # ---- Emitting weights ----
                    self.metagraph.set_weights(self.row, wait_for_inclusion = True) # Sets my row-weights on the chain.

                    # ---- Sync metagraph ----
                    self.metagraph.sync() # Pulls the latest metagraph state (with my update.)
                    self.row = self.metagraph.row
                    logger.info(self.metagraph)

                    # ---- Update Tensorboard ----
                    self.dendrite.__to_tensorboard__(self.tensorboard, self.global_step)
                    self.metagraph.__to_tensorboard__(self.tensorboard, self.global_step)
                    self.axon.__to_tensorboard__(self.tensorboard, self.global_step)
                
                    # ---- Save best loss and model ----
                    if self.training_loss and self.epoch % 10 == 0:
                        if self.training_loss < self.best_train_loss:
                            self.best_train_loss = self.training_loss # update best train loss
                            self.model_toolbox.save_model(
                                self.config.miner.full_path,
                                {
                                    'epoch': self.epoch, 
                                    'model_state_dict': self.model.state_dict(), 
                                    'loss': self.best_train_loss,
                                    'optimizer_state_dict': self.optimizer.state_dict(),
                                }
                            )
                            self.tensorboard.add_scalar('Neuron/Train_loss', self.training_loss, self.global_step)
                    
                # --- Catch Errors ----
                except Exception as e:
                    logger.error('Exception in training script with error: {}', e)
                    logger.info(traceback.format_exc())
                    logger.info('Continuing to train.')
                    time.sleep(1)
    
    # ---- Train Epoch ----
    def train(self):
        self.training_loss = 0.0
        for local_step in range(self.config.miner.epoch_length):
            # ---- Forward pass ----
            inputs, targets = mlm_batch(self.dataset, self.config.miner.batch_size_train, bittensor.__tokenizer__(), self.data_collator)
            output = self.model.remote_forward (
                    self,
                    inputs = inputs.to(self.model.device), 
                    targets = targets.to(self.model.device)
            )

            # ---- Backward pass ----
            loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
            loss.backward() # Accumulates gradients on the model.
            clip_grad_norm_(self.model.parameters(), self.config.miner.clip_gradients) # clip model gradients
            self.optimizer.step() # Applies accumulated gradients.
            self.optimizer.zero_grad() # Zeros out gradients for the next accumulation.

            # ---- Train row weights ----
            batch_weights = torch.mean(output.router.weights, axis = 0) # Average over batch.
            self.row = (1 - 0.03) * self.row + 0.03 * batch_weights # Moving avg update.
            self.row = F.normalize(self.row, p = 1, dim = 0) # Ensure normalization.

            # ---- Step logs ----
            logger.info('GS: {} LS: {} Epoch: {}\tLocal Target Loss: {}\tRemote Target Loss: {}\tDistillation Loss: {}\tAxon: {}\tDendrite: {}',
                    colored('{}'.format(self.global_step), 'red'),
                    colored('{}'.format(local_step), 'blue'),
                    colored('{}'.format(self.epoch), 'green'),
                    colored('{:.4f}'.format(output.local_target_loss.item()), 'green'),
                    colored('{:.4f}'.format(output.remote_target_loss.item()), 'blue'),
                    colored('{:.4f}'.format(output.distillation_loss.item()), 'red'),
                    self.axon,
                    self.dendrite)
            logger.info('Codes: {}', output.router.return_codes.tolist())
            
            self.tensorboard.add_scalar('Neuron/Rloss', output.remote_target_loss.item(), self.global_step)
            self.tensorboard.add_scalar('Neuron/Lloss', output.local_target_loss.item(), self.global_step)
            self.tensorboard.add_scalar('Neuron/Dloss', output.distillation_loss.item(), self.global_step)

            # ---- Step increments ----
            self.global_step += 1
            self.training_loss += output.local_target_loss.item()

            # --- Memory clean up ----
            torch.cuda.empty_cache()
            del output
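The "borked model" guard used in the run loop above (and again in the later examples) can be factored into a small helper; a minimal sketch, with the helper name being an illustrative choice rather than anything from the bittensor codebase:

import torch

def has_nan_params(model: torch.nn.Module) -> bool:
    # Same check as in the run loop: return True if any parameter
    # contains a NaN entry, so the miner can skip emitting weights.
    return any(torch.isnan(p).any().item() for p in model.parameters())

# Usage sketch inside the epoch loop:
# if has_nan_params(self.model):
#     self.model, self.optimizer = self.model_toolbox.load_model(self.config)
#     continue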
Example #14
0
class Miner():
    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Build Neuron ----
        self.neuron = bittensor.neuron.Neuron(config)

        # ---- Build FFNN Model ----
        self.model = FFNNSynapse(self.config)
        self.model.to(
            torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        self.neuron.axon.serve(self.model)

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=self.config.miner.learning_rate,
                                         momentum=self.config.miner.momentum)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(FFNNSynapse, torch.optim.SGD)

        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir=self.config.miner.full_path)
        if self.config.miner.record_log:
            logger.add(
                self.config.miner.full_path + "/{}_{}.log".format(
                    self.config.miner.name, self.config.miner.trial_uid),
                format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}")

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        Miner.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument('--miner.learning_rate',
                            default=0.01,
                            type=float,
                            help='Training initial learning rate.')
        parser.add_argument('--miner.momentum',
                            default=0.9,
                            type=float,
                            help='Training initial momentum for SGD.')
        parser.add_argument('--miner.n_epochs',
                            default=int(sys.maxsize),
                            type=int,
                            help='Number of training epochs.')
        parser.add_argument(
            '--miner.sync_interval',
            default=150,
            type=int,
            help='Batches before we sync with chain and emit new weights.')
        parser.add_argument(
            '--miner.root_dir',
            default='~/.bittensor/miners/',
            type=str,
            help='Root path to load and save data associated with each miner')
        parser.add_argument(
            '--miner.name',
            default='ffnn-grunt',
            type=str,
            help='Trials for this miner go in miner.root / miner.name')
        parser.add_argument(
            '--miner.trial_uid',
            default=str(time.time()).split('.')[0],
            type=str,
            help='Saved models go in miner.root_dir / miner.name / miner.uid')
        parser.add_argument('--miner.record_log',
                            default=False,
                            help='Record all logs when running this miner')
        parser.add_argument(
            '--miner.config_file',
            type=str,
            help=
            'config file to run this neuron, if not using cmd line arguments.')
        bittensor.neuron.Neuron.add_args(parser)
        FFNNSynapse.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        assert config.miner.momentum > 0 and config.miner.momentum < 1, "momentum must be a value between 0 and 1"
        assert config.miner.learning_rate > 0, "learning rate must be a positive value."
        full_path = '{}/{}/{}/'.format(config.miner.root_dir,
                                       config.miner.name,
                                       config.miner.trial_uid)
        config.miner.full_path = os.path.expanduser(full_path)
        if not os.path.exists(config.miner.full_path):
            os.makedirs(config.miner.full_path)

    # ---- Main loop ----
    def run(self):

        # --- Subscribe / Update neuron ---
        with self.neuron:

            # ---- Loop for epochs ----
            self.model.train()
            for self.epoch in range(self.config.miner.n_epochs):

                # ---- Poll until gradients ----
                public_key, inputs_x, grads_dy, modality_x = self.neuron.axon.gradients.get(
                    block=True)

                # ---- Backward Gradients ----
                # TODO (const): batch normalization over the gradients for consistency.
                grads_dy = torch.where(torch.isnan(grads_dy),
                                       torch.zeros_like(grads_dy), grads_dy)
                self.model.backward(inputs_x, grads_dy, modality_x)

                # ---- Apply Gradients ----
                self.optimizer.step()  # Apply accumulated gradients.
                self.optimizer.zero_grad()  # Clear any lingering gradients

                # If model has borked for some reason, we need to make sure it doesn't emit weights
                # Instead, reload into previous version of the model
                if torch.any(
                        torch.isnan(
                            torch.cat([
                                param.view(-1)
                                for param in self.model.parameters()
                            ]))):
                    self.model, self.optimizer = self.model_toolbox.load_model(
                        self.config)

                # ---- Serve latest model ----
                self.neuron.axon.serve(self.model)  # Serve the newest model.
                logger.info('Step: {} \t Key: {} \t sum(W[:,0]): {}', self.epoch,
                            public_key,
                            torch.sum(self.neuron.metagraph.col).item())

                # ---- Sync State ----
                if (self.epoch + 1) % self.config.miner.sync_interval == 0:

                    # --- Display Epoch ----
                    print(self.neuron.axon.__full_str__())
                    print(self.neuron.dendrite.__full_str__())
                    print(self.neuron.metagraph)

                    # ---- Sync metagraph from chain ----
                    self.neuron.metagraph.sync()  # Sync with the chain.

                    # --- Save Model ----
                    self.model_toolbox.save_model(
                        self.config.miner.full_path, {
                            'epoch': self.epoch,
                            'model_state_dict': self.model.state_dict(),
                            'optimizer_state_dict':
                            self.optimizer.state_dict(),
                        })
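The grunt miner above zeroes out NaN entries in remotely submitted gradients before applying them; a minimal sketch of that sanitization step in isolation (the function name is illustrative):

import torch

def sanitize_gradients(grads_dy: torch.Tensor) -> torch.Tensor:
    # Replace NaN entries with zeros so one bad remote gradient
    # cannot poison the local optimizer step.
    return torch.where(torch.isnan(grads_dy), torch.zeros_like(grads_dy), grads_dy)

# Usage sketch:
# grads_dy = sanitize_gradients(grads_dy)
# self.model.backward(inputs_x, grads_dy, modality_x)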
Example #15
0
class Miner( bittensor.miner.Miner ):

    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Model ----
        self.model = GPT2Synapse( self.config )

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(GPT2Synapse, AdamW)

        # ---- Optimizer ----
        self.optimizer = self.configure_optimizers()
        self.lr = self.config.miner.learning_rate
        self.training_loss = math.inf
        self.best_train_loss = math.inf
        self.rloss = math.inf
        self.lloss = math.inf
        self.dloss = math.inf

        # ---- Dataset ----
        # The Genesis Dataset:
        # The dataset used to train Adam and his first 100 children.
        self.dataset = GenesisTextDataloader(self.config.miner.batch_size_train, self.model.get_block_size())
        self.tokens = 0
        super( Miner, self ).__init__( self.config, **kwargs )
               
    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        Miner.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument(
            '--miner.learning_rate', 
            default=3e-2, 
            type=float, 
            help='Training initial learning rate.'
        )
        parser.add_argument(
            '--miner.weight_decay', 
            default=0.25, 
            type=float, 
            help='Model parameter weight decay.'
        )
        parser.add_argument(
            '--miner.lr_decay',
            default=True,
            type=bool,
            help='learning rate decay params: linear warmup followed by cosine decay to 10%% of original.'
        )
        parser.add_argument(
            '--miner.warmup_tokens',
            default=375e6,
            type=float,
            help='A linear LR warmup over the first miner.warmup_tokens tokens (default is 375 million)'
        )
        parser.add_argument(
            '--miner.final_tokens',
            default=260e9,
            type=float,
            help='At what point we reach 10%% of original LR'
        )
        parser.add_argument(
            '--miner.clip_gradients',
            default=1.0,
            type=float,
            help='Implement gradient clipping to avoid exploding loss on smaller architectures.'
        )
        parser.add_argument(
            '--miner.n_epochs', 
            default=int(sys.maxsize), 
            type=int, 
            help='Number of training epochs.'
        )
        parser.add_argument(
            '--miner.epoch_length', 
            default=500, 
            type=int, 
            help='Iterations of training per epoch'
        )
        parser.add_argument(
            '--miner.batch_size_train', 
            default=2, 
            type=int, 
            help='Training batch size.'
        )
        parser.add_argument('--miner.name', default='gpt2_genesis', type=str, help='Trials for this miner go in miner.root / (wallet_cold - wallet_hot) / miner.name ')
        GPT2Synapse.add_args( parser )
        bittensor.miner.Miner.add_args( parser )
        GenesisTextDataloader.add_args( parser )

    @staticmethod
    def check_config(config: Munch):
        assert config.miner.batch_size_train > 0, "batch_size_train must be a positive value"
        assert config.miner.learning_rate > 0, "learning_rate must be a positive value."
        bittensor.miner.Miner.check_config( config )
        GenesisTextDataloader.check_config( config )

    def configure_optimizers(self):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.

        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding, torch.nn.Tanh)
        for mn, m in self.model.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name

                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # special case the position embedding parameter in the root GPT module as not decayed
        no_decay.add('pos_emb')

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.model.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": self.config.miner.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=self.config.miner.learning_rate, betas=(0.9, 0.95))
        return optimizer
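A toy module makes the decay/no_decay split above easier to see; a minimal sketch using only standard torch layers (the toy model and variable names are illustrative, not part of GPT2Synapse):

import torch

class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(8, 8)   # weight -> decay, bias -> no_decay
        self.ln = torch.nn.LayerNorm(8)   # weight and bias -> no_decay

model = TinyModel()
decay, no_decay = set(), set()
for mn, m in model.named_modules():
    for pn, _ in m.named_parameters(recurse=False):
        fpn = f'{mn}.{pn}' if mn else pn
        if pn.endswith('bias') or isinstance(m, torch.nn.LayerNorm):
            no_decay.add(fpn)
        elif pn.endswith('weight') and isinstance(m, torch.nn.Linear):
            decay.add(fpn)
# decay == {'fc.weight'}; no_decay == {'fc.bias', 'ln.weight', 'ln.bias'}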

    # --- Main loop ----
    def run (self):

        # ---- Subscribe ----
        with self:

            # ---- Weights ----
            self.row = self.metagraph.row.to(self.model.device)

            # --- Run state ---
            self.global_step = 0

            # --- Loop for epochs ---
            for self.epoch in range(self.config.miner.n_epochs):

                # ---- Serve ----
                self.axon.serve( self.model )

                # ---- Train Model ----
                self.train()

                # If model has borked for some reason, we need to make sure it doesn't emit weights
                # Instead, reload into previous version of model
                if torch.any(torch.isnan(torch.cat([param.view(-1) for param in self.model.parameters()]))):
                    self.model, self.optimizer = self.model_toolbox.load_model(self.config)
                    continue

                # ---- Emitting weights ----
                try:
                    self.metagraph.set_weights(self.row, wait_for_inclusion = True) # Sets my row-weights on the chain.
                except Exception as e:
                    logger.error("Failed to set weights")
                    raise e

                try:
                    # ---- Sync metagraph ----
                    self.metagraph.sync() # Pulls the latest metagraph state (with my update.)
                except Exception as e:
                    logger.error("Failed to sync metagraph")
                    raise e

                
                self.row = self.metagraph.row.to(self.model.device)
                # ---- Update Tensorboard ----
                self.dendrite.__to_tensorboard__(self.tensorboard, self.global_step)
                self.metagraph.__to_tensorboard__(self.tensorboard, self.global_step)
                self.axon.__to_tensorboard__(self.tensorboard, self.global_step)

                # ---- Save best loss and model ----
                if self.training_loss < self.best_train_loss:
                        self.best_train_loss = self.training_loss  # update best train loss
                        self.model_toolbox.save_model(
                            self.config.miner.full_path,
                            {
                                'epoch': self.epoch,
                                'model_state_dict': self.model.state_dict(),
                                'loss': self.best_train_loss/3,
                                'optimizer_state_dict': self.optimizer.state_dict(),
                                'rloss' : self.rloss,
                                'lloss': self.lloss,
                                'dloss': self.dloss,
                            }
                        )
                        self.tensorboard.add_scalar('Neuron/Train_loss', self.training_loss, self.global_step)
                logger.info("This epoch's training losses: L-Loss: {:.2f} | R-Loss: {:.2f} | D-Loss: {:.2f} | avg: {:.2f} ... Current best average training loss: {:.2f}".format(self.lloss, self.rloss, self.dloss, self.training_loss/3, self.best_train_loss/3))


    def decay_learning_rate(self, batch):
        """Decay the learning rate based on training progress.
        Updates self.lr (and the optimizer's parameter groups) according to the
        total number of tokens processed so far: linear warmup, then cosine decay.

        Args:
            batch (torch.Tensor): Batch of token ids just processed; its size
                advances the token count used by the schedule.
        """

        if self.config.miner.lr_decay:
            # number of tokens processed this step
            self.tokens += (batch >= 0).sum()
            if self.tokens < self.config.miner.warmup_tokens:
                # linear warmup
                lr_mult = float(self.tokens) / float(max(1, self.config.miner.warmup_tokens))
            else:
                # cosine learning rate decay
                progress = float(self.tokens - self.config.miner.warmup_tokens) / float(max(1, self.config.miner.final_tokens - self.config.miner.warmup_tokens))
                lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))

            self.lr = self.config.miner.learning_rate * lr_mult

            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.lr
        else:
            self.lr = self.config.miner.learning_rate

    def get_lr(self):
        for param_group in self.optimizer.param_groups:
            return param_group['lr']
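The schedule in decay_learning_rate reduces to a single multiplier on the base learning rate; a minimal standalone sketch of that arithmetic (the function name and the sample token counts are illustrative):

import math

def lr_multiplier(tokens: float, warmup_tokens: float, final_tokens: float) -> float:
    # Linear warmup to 1.0, then cosine decay with a floor of 0.1,
    # mirroring the two branches of decay_learning_rate above.
    if tokens < warmup_tokens:
        return float(tokens) / float(max(1, warmup_tokens))
    progress = float(tokens - warmup_tokens) / float(max(1, final_tokens - warmup_tokens))
    return max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))

# With the defaults above (warmup_tokens=375e6, final_tokens=260e9):
# lr_multiplier(187.5e6, 375e6, 260e9)  ~= 0.5   (halfway through warmup)
# lr_multiplier(375e6,   375e6, 260e9)  ~= 1.0   (warmup just finished)
# lr_multiplier(260e9,   375e6, 260e9)   = 0.1   (fully decayed floor)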

    # ---- Train Epoch ----
    def train(self):

        def run_epoch():
            self.model.train(True)
            losses = []
            rlosses = []
            llosses = []
            dlosses = []

            # we train for an epoch.
            logger.info("Preparing dataset batch...")
            # Set up the dataloader
            dataloader = self.dataset.dataloader(self.config.miner.epoch_length)
            pbar = qqdm(enumerate(dataloader), total=len(dataloader), desc=format_str('blue', f'Epoch Progress'))
            for it, (batch) in pbar:
                # ---- Forward pass ----
                batch = batch.to(self.model.device)
                output = self.model.remote_forward(self, batch, training=True)

                # ---- Backward pass ----
                loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
                loss.backward()

                # ---- Gradient Step ----
                clip_grad_norm_(self.model.parameters(), self.config.miner.clip_gradients)
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.decay_learning_rate(batch)

                # Add losses up
                losses.append(loss.item())
                llosses.append(output.local_target_loss.item())
                rlosses.append(output.remote_target_loss.item())
                dlosses.append(output.distillation_loss.item())

                # ---- Train row weights ----
                batch_weights = torch.mean(output.router.weights, axis = 0).to(self.model.device) # Average over batch.
                self.row = (1 - 0.03) * self.row + 0.03 * batch_weights # Moving avg update.
                self.row = F.normalize(self.row, p = 1, dim = 0) # Ensure normalization.

                # ---- Logging ----
                index = self.metagraph.state.index_for_uid[self.metagraph.uid]
                pbar.set_infos({
                    'GS': colored('{}'.format(self.global_step), 'red'),
                    'LS': colored('{}'.format(it), 'blue'),
                    'Epoch': colored('{}'.format(self.epoch+1), 'green'),
                    'L-loss': colored('{:.5f}'.format(output.local_target_loss.item()), 'red'),
                    'R-loss': colored('{:.5f}'.format(output.remote_target_loss.item()), 'blue'),
                    'D-loss': colored('{:.5f}'.format(output.distillation_loss.item()), 'green'),
                    'lr': colored('{:e}'.format(self.lr), 'white'),
                    'nPeers': self.metagraph.n,
                    'Stake(\u03C4)': float(self.metagraph.S[index]),
                    'Rank(\u03C4)': float(self.metagraph.R[index]),
                    'Incentive(\u03C4/block)': float(self.metagraph.I[index]),
                    'Axon': self.axon.__str__(),
                    'Dendrite': self.dendrite.__str__(),
                })
                self.tensorboard.add_scalar('Neuron/Rloss', output.remote_target_loss.item(), self.global_step)
                self.tensorboard.add_scalar('Neuron/Lloss', output.local_target_loss.item(), self.global_step)
                self.tensorboard.add_scalar('Neuron/Dloss', output.distillation_loss.item(), self.global_step)
                self.global_step += 1


            avg_loss = sum(losses) / len(losses)
            self.rloss = sum(rlosses) / len(rlosses)
            self.lloss = sum(llosses) / len(llosses)
            self.dloss = sum(dlosses) / len(dlosses)

            self.training_loss = avg_loss

        run_epoch()
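The row-weight update inside run_epoch is an exponential moving average followed by an L1 re-normalization; a minimal sketch of that step in isolation (the function name and alpha argument are illustrative):

import torch
import torch.nn.functional as F

def update_row_weights(row: torch.Tensor, batch_weights: torch.Tensor, alpha: float = 0.03) -> torch.Tensor:
    # Exponential moving average of the per-batch router weights,
    # then L1-normalize so non-negative row weights still sum to one.
    row = (1 - alpha) * row + alpha * batch_weights
    return F.normalize(row, p=1, dim=0)

# Usage sketch:
# batch_weights = torch.mean(output.router.weights, dim=0)
# self.row = update_row_weights(self.row, batch_weights)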
Example #16
0
class Miner():

    def __init__(self, config: Munch = None):
        if config is None:
            config = Miner.build_config()
            logger.info(bittensor.config.Config.toString(config))
        self.config = config

        # ---- Neuron ----
        self.neuron = bittensor.neuron.Neuron(self.config)

        # ---- Model ----
        self.model = BertMLMSynapse( self.config )

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr = self.config.miner.learning_rate, momentum=self.config.miner.momentum)
        self.scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, 50, 300)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(BertMLMSynapse, torch.optim.SGD)

        # ---- Dataset ----
        # Dataset: AG News headlines.
        self.dataset = load_dataset('ag_news')['train']
        # The collator accepts a list of dicts (e.g. [{'input_ids': ...}, ...])
        # where each dict is produced by the tokenizer.
        self.data_collator = DataCollatorForLanguageModeling (
            tokenizer=bittensor.__tokenizer__(), mlm=True, mlm_probability=0.15
        )

        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir = self.config.miner.full_path)
        if self.config.miner.record_log:
            logger.add(self.config.miner.full_path + "/{}_{}.log".format(self.config.miner.name, self.config.miner.trial_uid),format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}")

    @staticmethod
    def build_config() -> Munch:
        parser = argparse.ArgumentParser()
        Miner.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        Miner.check_config(config)
        return config

    @staticmethod
    def check_config(config: Munch):
        assert config.miner.momentum > 0 and config.miner.momentum < 1, "momentum must be a value between 0 and 1"
        assert config.miner.batch_size_train > 0, "batch_size_train must be a positive value"
        assert config.miner.learning_rate > 0, "learning_rate must be a positive value."
        full_path = '{}/{}/{}'.format(config.miner.root_dir, config.miner.name, config.miner.trial_uid)
        config.miner.full_path = os.path.expanduser(full_path)
        if not os.path.exists(config.miner.full_path):
            os.makedirs(config.miner.full_path)
        BertMLMSynapse.check_config(config)
        bittensor.neuron.Neuron.check_config(config)

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument('--miner.learning_rate', default=0.01, type=float, help='Training initial learning rate.')
        parser.add_argument('--miner.momentum', default=0.98, type=float, help='Training initial momentum for SGD.')
        parser.add_argument('--miner.n_epochs', default=int(sys.maxsize), type=int, help='Number of training epochs.')
        parser.add_argument('--miner.epoch_length', default=500, type=int, help='Iterations of training per epoch')
        parser.add_argument('--miner.batch_size_train', default=1, type=int, help='Training batch size.')
        parser.add_argument('--miner.sync_interval', default=100, type=int, help='Batches before we sync with chain and emit new weights.')
        parser.add_argument('--miner.log_interval', default=10, type=int, help='Batches before we log miner info.')
        parser.add_argument('--miner.accumulation_interval', default=1, type=int, help='Batches before we apply accumulated gradients.')
        parser.add_argument('--miner.apply_remote_gradients', default=False, type=bool, help='If true, neuron applies gradients which accumulate from remotes calls.')
        parser.add_argument('--miner.root_dir', default='~/.bittensor/miners/', type=str,  help='Root path to load and save data associated with each miner')
        parser.add_argument('--miner.name', default='bert-nsp', type=str, help='Trials for this miner go in miner.root / miner.name')
        parser.add_argument('--miner.trial_uid', default=str(time.time()).split('.')[0], type=str, help='Saved models go in miner.root_dir / miner.name / miner.uid')
        parser.add_argument('--miner.record_log', default=True, help='Record all logs when running this miner')
        parser.add_argument('--miner.config_file', type=str, help='config file to run this neuron, if not using cmd line arguments.')
        BertMLMSynapse.add_args(parser)
        bittensor.neuron.Neuron.add_args(parser)

    # --- Main loop ----
    def run (self):

        # ---- Subscribe ----
        with self.neuron:

            # ---- Weights ----
            self.row = self.neuron.metagraph.row

            # --- Run state ---
            self.global_step = 0
            self.best_train_loss = math.inf

            # --- Loop for epochs ---
            for self.epoch in range(self.config.miner.n_epochs):
                try:
                    # ---- Serve ----
                    self.neuron.axon.serve( self.model )

                    # ---- Train Model ----
                    self.train()
                    self.scheduler.step()

                    # If model has borked for some reason, we need to make sure it doesn't emit weights
                    # Instead, reload into previous version of model
                    if torch.any(torch.isnan(torch.cat([param.view(-1) for param in self.model.parameters()]))):
                        self.model, self.optimizer = self.model_toolbox.load_model(self.config)    
                        continue

                    # ---- Emitting weights ----
                    self.neuron.metagraph.set_weights(self.row, wait_for_inclusion = True) # Sets my row-weights on the chain.

                    # ---- Sync metagraph ----
                    self.neuron.metagraph.sync() # Pulls the latest metagraph state (with my update.)
                    self.row = self.neuron.metagraph.row

                    # --- Epoch logs ----
                    print(self.neuron.axon.__full_str__())
                    print(self.neuron.dendrite.__full_str__())
                    print(self.neuron.metagraph)

                    # ---- Update Tensorboard ----
                    self.neuron.dendrite.__to_tensorboard__(self.tensorboard, self.global_step)
                    self.neuron.metagraph.__to_tensorboard__(self.tensorboard, self.global_step)
                    self.neuron.axon.__to_tensorboard__(self.tensorboard, self.global_step)
                
                    # ---- Save best loss and model ----
                    if self.training_loss and self.epoch % 10 == 0:
                        if self.training_loss < self.best_train_loss:
                            self.best_train_loss = self.training_loss # update best train loss
                            self.model_toolbox.save_model(
                                self.config.miner.full_path,
                                {
                                    'epoch': self.epoch, 
                                    'model_state_dict': self.model.state_dict(), 
                                    'loss': self.best_train_loss,
                                    'optimizer_state_dict': self.optimizer.state_dict(),
                                }
                            )
                            self.tensorboard.add_scalar('Neuron/Train_loss', self.training_loss, self.global_step)
                    
                # --- Catch Errors ----
                except Exception as e:
                    logger.error('Exception in training script with error: {}', e)
                    logger.info(traceback.format_exc())
                    logger.info('Continuing to train.')
                    time.sleep(1)
    
    # ---- Train Epoch ----
    def train(self):
        self.training_loss = 0.0
        for local_step in range(self.config.miner.epoch_length):
            # ---- Forward pass ----
            inputs, targets = mlm_batch(self.dataset, self.config.miner.batch_size_train, bittensor.__tokenizer__(), self.data_collator)
            output = self.model.remote_forward (
                    self.neuron,
                    inputs = inputs.to(self.model.device), 
                    targets = targets.to(self.model.device)
            )

            # ---- Backward pass ----
            loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
            loss.backward() # Accumulates gradients on the model.
            self.optimizer.step() # Applies accumulated gradients.
            self.optimizer.zero_grad() # Zeros out gradients for the next accumulation.

            # ---- Train row weights ----
            batch_weights = torch.mean(output.router.weights, axis = 0) # Average over batch.
            self.row = (1 - 0.03) * self.row + 0.03 * batch_weights # Moving avg update.
            self.row = F.normalize(self.row, p = 1, dim = 0) # Ensure normalization.

            # ---- Step logs ----
            logger.info('GS: {} LS: {} Epoch: {}\tLocal Target Loss: {}\tRemote Target Loss: {}\tDistillation Loss: {}\tAxon: {}\tDendrite: {}',
                    colored('{}'.format(self.global_step), 'red'),
                    colored('{}'.format(local_step), 'blue'),
                    colored('{}'.format(self.epoch), 'green'),
                    colored('{:.4f}'.format(output.local_target_loss.item()), 'green'),
                    colored('{:.4f}'.format(output.remote_target_loss.item()), 'blue'),
                    colored('{:.4f}'.format(output.distillation_loss.item()), 'red'),
                    self.neuron.axon,
                    self.neuron.dendrite)
            logger.info('Codes: {}', output.router.return_codes.tolist())
            
            self.tensorboard.add_scalar('Neuron/Rloss', output.remote_target_loss.item(), self.global_step)
            self.tensorboard.add_scalar('Neuron/Lloss', output.local_target_loss.item(), self.global_step)
            self.tensorboard.add_scalar('Neuron/Dloss', output.distillation_loss.item(), self.global_step)

            # ---- Step increments ----
            self.global_step += 1
            self.training_loss += output.local_target_loss.item()

            # --- Memory clean up ----
            torch.cuda.empty_cache()
            del output
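mlm_batch is not defined in any of these snippets; a minimal sketch of how such a batch could be assembled with the Hugging Face collator configured above (the function body, the 'text' field access, and the max_length choice are assumptions about ag_news and the tokenizer, not the project's actual implementation):

import random
from transformers import DataCollatorForLanguageModeling

def mlm_batch(dataset, batch_size, tokenizer, data_collator):
    # Sample a few rows of text, tokenize them, and let the collator mask
    # ~15% of the tokens; 'labels' holds the original ids at masked
    # positions and -100 elsewhere, as expected by a masked-LM loss.
    texts = [dataset[random.randint(0, len(dataset) - 1)]['text'] for _ in range(batch_size)]
    encodings = [tokenizer(t, truncation=True, max_length=128) for t in texts]
    batch = data_collator([{'input_ids': e['input_ids']} for e in encodings])
    return batch['input_ids'], batch['labels']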