def main():
    client_config = ClientConfig(
        idx=args.idx,
        master_ip_addr=args.master_ip,
        action=""
    )
    print("start")

    # Fetch the initial configuration (dataset type, data partition indices) from the master.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    tasks = [asyncio.ensure_future(get_init_config(client_config))]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()

    # Build local train/test loaders from the indices assigned to this client.
    train_dataset, test_dataset = datasets.load_datasets(client_config.custom["dataset_type"])
    train_loader = utils.create_dataloaders(train_dataset,
                                            batch_size=args.batch_size,
                                            selected_idxs=client_config.custom["train_data_idxes"])
    test_loader = utils.create_dataloaders(test_dataset,
                                           batch_size=128,
                                           selected_idxs=client_config.custom["test_data_idxes"],
                                           shuffle=False)

    # Run one round of local training per iteration, each on a fresh event loop.
    while True:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        tasks = [asyncio.ensure_future(local_training(client_config, train_loader, test_loader))]
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
def train_autoencoder():
    data = utils.load_data(root_dir='./data/', mode='train')
    data, target, features, date = utils.preprocess_data(data, nn=True)
    dataset = utils.FinData(data=data, target=target, date=date)

    # Hard-coded hyperparameters.
    p = {'batch_size': 4597, 'dim_1': 231, 'dim_2': 851, 'dim_3': 777,
         'dim_4': 192, 'hidden': 50, 'dropout': 0.017122456592972537,
         'lr': 0.0013131268366473552, 'activation': nn.GELU,
         'label_smoothing': 0.09401544509474698,
         'weight_decay': 0.005078413740277699, 'amsgrad': True}

    # Train on the full dataset; a 10k-row validation split is built but only the
    # train loader is passed to fit() below.
    train_idx = list(range(len(data)))
    val_idx = list(range(10000))
    dataloaders = utils.create_dataloaders(dataset=dataset,
                                           indexes={'train': train_idx, 'val': val_idx},
                                           batch_size=p['batch_size'])

    checkpoint_callback = ModelCheckpoint(dirpath='logs', monitor='t_loss',
                                          mode='min', save_top_k=1, period=10)
    input_size = data.shape[-1]
    output_size = 1
    model = AutoEncoder(input_size=input_size, output_size=output_size, params=p)
    es = EarlyStopping(monitor='t_loss', patience=10, min_delta=0.0005, mode='min')
    trainer = pl.Trainer(max_epochs=500, gpus=1,
                         callbacks=[checkpoint_callback, es], precision=16)
    trainer.fit(model, train_dataloader=dataloaders['train'])
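
# Usage sketch (not from the original source): reload the best checkpoint written by
# the ModelCheckpoint callback in train_autoencoder(). Assumes AutoEncoder is a
# pl.LightningModule; the helper name and checkpoint path are illustrative only.
def load_best_autoencoder(checkpoint_path, input_size, output_size, params):
    model = AutoEncoder.load_from_checkpoint(checkpoint_path,
                                             input_size=input_size,
                                             output_size=output_size,
                                             params=params)
    model.eval()
    return model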
def train_cross_val(p):
    data_ = load_data(root_dir='./data/', mode='train')
    data_, target_, features, date = preprocess_data(data_, nn=True)
    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=5)
    input_size = data_.shape[-1]
    output_size = 1
    tb_logger = pl_loggers.TensorBoardLogger('logs/')
    models = []
    for i, (train_idx, val_idx) in enumerate(gts.split(data_, groups=date)):
        idx = np.concatenate([train_idx, val_idx])
        data = copy.deepcopy(data_[idx])
        target = copy.deepcopy(target_[idx])
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            os.path.join('models/', "fold_{}".format(i)),
            monitor="val_auc", mode='max', save_top_k=1, period=10)
        model = Classifier(input_size=input_size, output_size=output_size, params=p)
        if p['activation'] == nn.ReLU:
            model.apply(lambda m: init_weights(m, 'relu'))
        elif p['activation'] == nn.LeakyReLU:
            model.apply(lambda m: init_weights(m, 'leaky_relu'))
        # Re-index train/val relative to the concatenated fold slice.
        train_idx = list(range(0, max(train_idx) + 1))
        val_idx = list(range(len(train_idx), len(idx)))
        data[train_idx] = calc_data_mean(data[train_idx], './cache', train=True, mode='mean')
        data[val_idx] = calc_data_mean(data[val_idx], './cache', train=False, mode='mean')
        dataset = FinData(data=data, target=target, date=date)
        dataloaders = create_dataloaders(dataset,
                                         indexes={'train': train_idx, 'val': val_idx},
                                         batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_auc', patience=10, min_delta=0.0005, mode='max')
        trainer = pl.Trainer(logger=tb_logger, max_epochs=500, gpus=1,
                             callbacks=[checkpoint_callback, es], precision=16)
        trainer.fit(model,
                    train_dataloader=dataloaders['train'],
                    val_dataloaders=dataloaders['val'])
        torch.save(model.state_dict(), f'models/fold_{i}_state_dict.pth')
        models.append(model)
    return models, features
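
# Inference sketch (an assumption, not part of the original code): average the fold
# models returned by train_cross_val() into a simple ensemble. Assumes the Classifier's
# forward pass returns a single logit per row for the binary target.
def predict_ensemble(models, x):
    with torch.no_grad():
        preds = [torch.sigmoid(model(x)) for model in models]
    return torch.stack(preds).mean(dim=0)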
def final_train(p, load=False):
    data_ = load_data(root_dir='./data/', mode='train')
    data, target, features, date = preprocess_data(data_, nn=True)
    input_size = data.shape[-1]
    output_size = 1
    # Train on the first 450 days, validate on the remainder.
    train_idx = date[date <= 450].index.values.tolist()
    val_idx = date[date > 450].index.values.tolist()
    data[train_idx] = calc_data_mean(data[train_idx], './cache', train=True)
    data[val_idx] = calc_data_mean(data[val_idx], './cache', train=False)
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        filepath='models/full_train', monitor="val_auc", mode='max',
        save_top_k=1, period=10)
    model = Classifier(input_size=input_size, output_size=output_size, params=p)
    if p['activation'] == nn.ReLU:
        model.apply(lambda m: init_weights(m, 'relu'))
    elif p['activation'] == nn.LeakyReLU:
        model.apply(lambda m: init_weights(m, 'leaky_relu'))
    dataset = FinData(data, target, date)
    dataloaders = create_dataloaders(dataset,
                                     indexes={'train': train_idx, 'val': val_idx},
                                     batch_size=p['batch_size'])
    es = EarlyStopping(monitor='val_auc', patience=10, min_delta=0.0005, mode='max')
    trainer = pl.Trainer(max_epochs=500, gpus=1,
                         callbacks=[checkpoint_callback, es], precision=16)
    trainer.fit(model,
                train_dataloader=dataloaders['train'],
                val_dataloaders=dataloaders['val'])
    torch.save(model.state_dict(), 'models/final_train.pth')
    return model, features
def train(args, states=None):
    config_obj = Config(args.config_file)
    config = config_obj.elements

    # make training runs deterministic
    set_seed(seed_value=config['random_seed'])

    logging.info("Loading datasets...")
    dataset, labels = load_embeddings(data_path=config['data'],
                                      label_path=config['labels'])
    train_loader, val_loader, test_loader = create_dataloaders(
        dataset,
        labels,
        batch_size=config['batch_size'],
        random_seed=config['random_seed'],
        balance=config['correct_imbalance'],
    )

    model = TextCNN(
        num_classes=config['num_classes'],
        embedding_size=config['embedding_size'],
        num_filters=config['num_filters'],
        dropout_rate=config['dropout'],
    )
    if torch.cuda.is_available():
        model.cuda()

    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])

    best_metric = 0
    # loop over the dataset multiple times
    for epoch in range(1, config['num_epochs'] + 1):
        logging.info(f"==================== Epoch: {epoch} ====================")
        running_losses = []
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            if torch.cuda.is_available():
                inputs, labels = inputs.cuda(), labels.cuda()

            # zero the parameter gradients before each pass
            optimizer.zero_grad()

            # forward
            probs, classes = model(inputs)

            # backprop
            loss = loss_function(probs, labels)
            loss.backward()

            # update/optimize
            optimizer.step()

            # Log summary
            running_losses.append(loss.item())
            if i % args.log_interval == 0:
                interval_loss = sum(running_losses) / len(running_losses)
                logging.info(f"step = {i}, loss = {interval_loss}")
                running_losses = []

            if i % args.test_interval == 0:
                dev_metric = eval(
                    val_loader,
                    model,
                    loss_function,
                    args.eval_metric,
                )
                if dev_metric > best_metric:
                    best_metric = dev_metric
                    states = {
                        "epoch": epoch,
                        "step": i,
                        "model": model.state_dict(),
                        "optimizer": optimizer.state_dict()
                    }
                    save_model_state(save_dir=args.model_dir,
                                     step=i,
                                     states=states)

    print(f"Finished Training, best {args.eval_metric}: {best_metric}")
def optimize(trial: optuna.Trial, data_dict):
    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=10)
    input_size = data_dict['data'].shape[-1]
    output_size = 5
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        os.path.join('models/', "trial_resnet_{}".format(trial.number)),
        monitor="val_auc", mode='max')
    logger = MetricsCallback()
    metrics = []
    sizes = []
    # trial_file = 'HPO/nn_hpo_2021-01-05.pkl'
    trial_file = None
    p = create_param_dict(trial, trial_file)
    p['batch_size'] = trial.suggest_int('batch_size', 8000, 15000)
    for i, (train_idx, val_idx) in enumerate(
            gts.split(data_dict['data'], groups=data_dict['date'])):
        idx = np.concatenate([train_idx, val_idx])
        data = copy.deepcopy(data_dict['data'][idx])
        target = copy.deepcopy(data_dict['target'][idx])
        date = copy.deepcopy(data_dict['date'][idx])
        # Re-index train/val relative to the concatenated fold slice.
        train_idx = list(range(0, max(train_idx) + 1))
        val_idx = list(range(len(train_idx), len(idx)))
        data[train_idx] = calc_data_mean(data[train_idx], './cache', train=True, mode='mean')
        data[val_idx] = calc_data_mean(data[val_idx], './cache', train=False, mode='mean')
        model = Classifier(input_size, output_size, params=p)
        # model.apply(init_weights)
        dataset = FinData(data=data, target=target, date=date, multi=True)
        dataloaders = create_dataloaders(dataset,
                                         indexes={'train': train_idx, 'val': val_idx},
                                         batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_loss', patience=10, min_delta=0.0005, mode='min')
        trainer = pl.Trainer(logger=False,
                             max_epochs=500,
                             gpus=1,
                             callbacks=[
                                 checkpoint_callback,
                                 logger,
                                 PyTorchLightningPruningCallback(trial, monitor='val_loss'),
                                 es
                             ],
                             precision=16)
        trainer.fit(model,
                    train_dataloader=dataloaders['train'],
                    val_dataloaders=dataloaders['val'])
        val_loss = logger.metrics[-1]['val_loss'].item()
        metrics.append(val_loss)
        sizes.append(len(train_idx))
    # Weight each fold's validation loss by its training-set size.
    metrics_mean = weighted_mean(metrics, sizes)
    return metrics_mean
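
# Driver sketch for the Optuna objective above; the helper name, study settings and
# trial count are assumptions for illustration, not taken from the original code.
def run_hpo(data_dict, n_trials=50):
    # The objective returns a weighted mean validation loss, so we minimise it;
    # the MedianPruner works with the PyTorchLightningPruningCallback used above.
    study = optuna.create_study(direction='minimize',
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(lambda trial: optimize(trial, data_dict), n_trials=n_trials)
    return study.best_params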
def train(args, states=None):
    config_obj = Config(args.config_file)
    config = config_obj.elements

    # make training runs deterministic
    set_seed(seed_value=config['random_seed'])

    logging.info("Loading datasets...")
    dataset, labels = load_tokens(
        input_id_path=config['input_id'],
        token_type_id_path=config['token_type_id'],
        attention_mask_path=config['attention_mask'],
        label_path=config['labels'],
    )
    train_loader, val_loader, test_loader = create_dataloaders(
        dataset,
        labels,
        batch_size=config['batch_size'],
        random_seed=config['random_seed'],
        balance=config['correct_imbalance'],
    )

    model = BertForSequenceClassification.from_pretrained(
        "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )
    if torch.cuda.is_available():
        model.cuda()

    loss_function = nn.CrossEntropyLoss()
    # optimizer = AdamW(model.parameters(), lr=config['lr'])
    optimizer = torch.optim.SGD(model.parameters(), lr=config['lr'])
    total_train_steps = config['num_epochs'] * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_train_steps,
    )

    best_metric = 0
    # loop over the dataset multiple times
    for epoch in range(1, config['num_epochs'] + 1):
        logging.info(f"==================== Epoch: {epoch} ====================")
        running_losses = []
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            input_ids, token_type_ids, attention_mask, labels = data
            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                token_type_ids = token_type_ids.cuda()
                attention_mask = attention_mask.cuda()
                labels = labels.cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            _, logits = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            # probs = F.softmax(logits, dim=1)

            # backprop
            loss = loss_function(logits, labels)
            loss.backward()

            # clip gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # update/optimize
            optimizer.step()
            # update learning rate
            scheduler.step()

            # Log summary
            running_losses.append(loss.item())
            if i % args.log_interval == 0:
                interval_loss = sum(running_losses) / len(running_losses)
                logging.info(f"step = {i}, loss = {interval_loss}")
                running_losses = []

            if i % args.test_interval == 0:
                dev_metric = eval(
                    val_loader,
                    model,
                    loss_function,
                    args.eval_metric,
                )
                if dev_metric > best_metric:
                    best_metric = dev_metric
                    states = {
                        "epoch": epoch,
                        "step": i,
                        "model": model.state_dict(),
                        "optimizer": optimizer.state_dict()
                    }
                    save_model_state(save_dir=args.model_dir,
                                     step=i,
                                     states=states)

    print(f"Finished Training, best {args.eval_metric}: {best_metric}")
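
# Reload sketch (assumed, not from the original source): restore the fine-tuned weights
# saved by save_model_state() above. The helper name and checkpoint path are illustrative;
# the "model" key matches the states dict built in the training loop.
def load_finetuned_bert(checkpoint_path):
    model = BertForSequenceClassification.from_pretrained(
        "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
        num_labels=2)
    states = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(states['model'])
    model.eval()
    return model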
args = parser.parse_args()

if __name__ == "__main__":
    # get the values from the params
    data_dir = args.data_dir
    checkpoint_path = args.checkpoint_path
    arch = args.arch
    lr = args.learning_rate
    dropout = args.dropout
    hidden_units = args.hidden_units
    epochs = args.epochs
    gpu = args.gpu

    print("These are the arguments supplied! :\n {}".format(args))

    image_datasets, dataloaders = utils.create_dataloaders(data_dir)

    # nn model stuff
    supported_archs = utils.get_supported_archs()
    if arch not in supported_archs:
        print("'{}' not supported. Please choose either vgg16 or alexnet".format(arch))
    else:
        pretrained_model = supported_archs[arch][0]
        model_input = supported_archs[arch][1]
        food_classifier = utils.setup_nn(pretrained_model, model_input,
                                         hidden_units, dropout)
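
# Example invocation (argument names inferred from the attributes read above; the
# script name and values are illustrative, not project defaults):
#
#   python train.py --data_dir flowers --arch vgg16 --learning_rate 0.001 \
#       --hidden_units 512 --dropout 0.2 --epochs 5 --gpu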