def train(hidden_size, num_layers, lr, weight_decay):
    region = "germany"
    log_name = log_pattern.format(region=region, num_layers=num_layers, hidden_size=hidden_size,
                                  lr=lr, weight_decay=weight_decay)
    log_path = os.path.join(log_dir, log_name)
    if os.path.exists(log_path):
        print(f"{log_path} exists. skipping...")
        return
    try:
        model, dataset, validdataset, dataloader, validdataloader, optimizer = setup(
            hidden_size, num_layers, lr, weight_decay)
        stats = list()
        for epoch in range(epochs):
            trainloss = train_epoch(model, dataloader, optimizer, criterion, device)
            testmetrics, testloss = test_epoch(model, validdataloader, device, criterion, n_predictions=1)
            metric_msg = ", ".join([
                f"{name}={metric.compute():.2f}" for name, metric in testmetrics.items()
            ])
            msg = f"epoch {epoch}: train loss {trainloss:.2f}, test loss {testloss:.2f}, {metric_msg}"
            print(msg)
            # test_model(model, validdataset, device)
            model_name = name_pattern.format(region=region, num_layers=num_layers, hidden_size=hidden_size,
                                             lr=lr, weight_decay=weight_decay, epoch=epoch)
            pth = os.path.join(model_dir, model_name + ".pth")
            print(f"saving model snapshot to {pth}")
            snapshot(model, optimizer, pth)

            stat = dict()
            stat["epoch"] = epoch
            for name, metric in testmetrics.items():
                stat[name] = metric.compute()
            stat["trainloss"] = trainloss.cpu().detach().numpy()
            stat["testloss"] = testloss.cpu().detach().numpy()
            stats.append(stat)
    finally:
        df = pd.DataFrame(stats)
        df.to_csv(log_path)
        print(f"saving log to {log_path}")
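# The snapshot() helper called above is not defined in this snippet. A minimal sketch of
# what it might look like, assuming it simply bundles the model and optimizer state dicts
# with torch.save; the key names and the load counterpart are hypothetical, not the
# original implementation:
import torch


def snapshot(model, optimizer, path):
    """Save model and optimizer state to `path` so a run can be resumed later."""
    torch.save({
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
    }, path)


def load_snapshot(model, optimizer, path, device="cpu"):
    """Restore a snapshot written by snapshot() (key names are assumptions)."""
    state = torch.load(path, map_location=device)
    model.load_state_dict(state["model_state"])
    optimizer.load_state_dict(state["optimizer_state"])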
def main(args):
    mode = "evaluation" + str(args.fold)
    traindataloader, testdataloader, meta = get_dataloader(
        args.datapath, mode, args.batchsize, args.workers,
        level=args.level, preload_ram=args.preload_ram)

    num_classes = meta["num_classes"]
    ndims = meta["ndims"]
    sequencelength = meta["sequencelength"]

    print(f"Logging results to {args.logdir}")
    logdir = os.path.join(args.logdir, str(args.fold))
    os.makedirs(logdir, exist_ok=True)

    epochs, learning_rate, weight_decay = select_hyperparameter(args.model)
    device = torch.device(args.device)
    model = get_model(args.model, ndims, num_classes, sequencelength, device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    model.modelname += f"_learning-rate={learning_rate}_weight-decay={weight_decay}"
    print(f"Initialized {model.modelname}")

    criterion = torch.nn.CrossEntropyLoss(reduction="mean")

    for epoch in range(epochs):
        print(f"train epoch {epoch}")
        train_epoch(model, optimizer, criterion, traindataloader, device)

    losses, y_true, y_pred, y_score, field_ids = test_epoch(
        model, criterion, dataloader=testdataloader, device=device)

    logdir = os.path.join(logdir, args.model)
    os.makedirs(logdir, exist_ok=True)
    print(f"saving results to {logdir}")

    print(sklearn.metrics.classification_report(y_true.cpu(), y_pred.cpu()),
          file=open(os.path.join(logdir, "classification_report.txt"), "w"))
    np.save(os.path.join(logdir, "y_pred.npy"), y_pred.cpu().numpy())
    np.save(os.path.join(logdir, "y_true.npy"), y_true.cpu().numpy())
    np.save(os.path.join(logdir, "y_score.npy"), y_score.cpu().numpy())
    np.save(os.path.join(logdir, "field_ids.npy"), field_ids.numpy())
    save(model, os.path.join(logdir, model.modelname + ".pth"))
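# test_epoch() above returns per-batch losses plus concatenated labels, predictions,
# scores, and field ids. A minimal sketch of such a function, under the assumption that
# each batch yields (x, y, field_id) and the model returns class logits; the batch
# format and variable names are assumptions, not the original code:
import torch


def test_epoch(model, criterion, dataloader, device):
    model.eval()
    losses, y_true, y_pred, y_score, field_ids = [], [], [], [], []
    with torch.no_grad():
        for x, y, field_id in dataloader:
            logits = model(x.to(device))
            loss = criterion(logits, y.to(device))
            losses.append(loss.cpu())
            y_true.append(y)
            y_pred.append(logits.argmax(dim=-1).cpu())
            y_score.append(logits.softmax(dim=-1).cpu())
            field_ids.append(field_id)
    return (torch.stack(losses), torch.cat(y_true), torch.cat(y_pred),
            torch.cat(y_score), torch.cat(field_ids))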
def test(model, data_path, label_file, save, batch_size):
    data_list = os.listdir(data_path)
    dataset = octDataset(data_path, data_list, label_file, argument=False)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                             num_workers=0, collate_fn=dataset.collate_fn)
    if torch.cuda.is_available():
        model = model.cuda()
    model.load_state_dict(torch.load(os.path.join(save, 'v1_test_model.dat')))
    _, loss, error = test_epoch(model, dataloader, is_test=True)
    return error.avg
# dataset
train_data, test_data = data.train_loader(args, kwargs), data.test_loader(args, kwargs)

# model
net = model.MnistClassifer().to(device)
print(net)

# optimizer
optimizer = optim.SGD(net.parameters(), lr=args.lr)
# optimizer = optim.Adam(net.parameters(), lr=0.0001)

# train
train_losses, train_accuracies = [], []
test_losses, test_accuracies = [], []
for epoch in range(1, args.epochs + 1):
    print("Epoch: {}".format(epoch))
    train_loss, train_accuracy = train_epoch(args, net, device, train_data, optimizer, epoch)
    test_loss, test_accuracy = test_epoch(args, net, device, test_data)
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)
    save_graph_image(epoch, train_losses, train_accuracies, test_losses, test_accuracies)

torch.save(net.state_dict(), 'mnist_model_params.pth')
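# train_epoch()/test_epoch() above each return a (loss, accuracy) pair per epoch. A
# minimal sketch of the evaluation side, assuming the classifier outputs
# log-probabilities and the loader yields (images, labels) batches; the exact signature
# and loss choice are assumptions, not the original helpers:
import torch
import torch.nn.functional as F


def test_epoch(args, net, device, test_loader):
    net.eval()
    total_loss, correct, count = 0.0, 0, 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            output = net(images)
            total_loss += F.nll_loss(output, labels, reduction="sum").item()
            correct += (output.argmax(dim=1) == labels).sum().item()
            count += labels.size(0)
    return total_loss / count, correct / count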
def main():
    best_prec1 = 0
    test = True
    log = True
    save_best = True
    sample_length = 0.5
    num_samples = np.int(np.round(5000 / sample_length))  # together I want about 5000 seconds from each subject
    batch_size = 100
    num_epochs = 200
    dropout = 0.4
    task = 'subject_prediction'
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    torch.backends.cudnn.benchmark = True

    root_path = pathlib.Path.cwd()
    matrix = root_path.joinpath('data', f'cleaned_{sample_length}sec_{num_samples}.npy')
    training_dataset = LFPData(data_file=matrix, split='train', standardize=True)
    training_loader = DataLoader(training_dataset, shuffle=True, batch_size=batch_size,
                                 pin_memory=True, num_workers=1)
    validation_set = LFPData(data_file=matrix, split='valid', standardize=True)
    validation_loader = DataLoader(validation_set, shuffle=False, batch_size=batch_size,
                                   pin_memory=True, num_workers=1)

    # input_shape = (2, np.int(422 * sample_length))  # this is a hack to figure out shape of fc layer
    # net = conv1d_nn.Net(input_shape=input_shape, dropout=dropout)
    net = conv1d_nn.FCN(in_channels=2, num_classes=9)
    net.apply(init_weights)
    net.cuda()

    criterion = nn.CrossEntropyLoss()
    criterion.cuda()

    # optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    optimizer = optim.Adam(net.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-8)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, threshold=1e-2)
    stop_criterion = EarlyStopping()

    title = f'FCN2_cleaned_{sample_length}sec_{num_samples}'
    if log:
        log_dir = root_path.joinpath('logs', title)
        if not log_dir.exists():
            log_dir.mkdir()
        training_log = log_dir.joinpath('log')
        if not training_log.exists():
            open(str(training_log), 'w').close()
        result_writer = ResultsWriter(str(training_log), overwrite=True)

    mlog = MeterLogger(server='localhost', port=8097, nclass=9, title=title, env=title)

    for epoch in range(1, num_epochs + 1):
        mlog.timer.reset()
        train_epoch(training_loader, net, criterion, optimizer, mlog)
        if log:
            result_writer.update(title, {'Train': mlog.peek_meter()})
        mlog.print_meter(mode="Train", iepoch=epoch)
        mlog.reset_meter(mode="Train", iepoch=epoch)

        validation_loss = val_epoch(validation_loader, net, criterion, mlog)
        prec1 = mlog.meter['accuracy'].value()[0]

        if save_best:
            # remember best prec@1 and save checkpoint
            is_best = prec1 > best_prec1
            if is_best:
                best_prec1 = max(prec1, best_prec1)
                save_checkpoint(
                    root_path.joinpath('checkpoints', title), {
                        'epoch': epoch + 1,
                        'state_dict': net.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer': optimizer.state_dict(),
                    }, is_best)

        if log:
            result_writer.update(title, {'Validation': mlog.peek_meter()})
        mlog.print_meter(mode="Test", iepoch=epoch)
        mlog.reset_meter(mode="Test", iepoch=epoch)

        stop_criterion.eval_loss(validation_loss)
        if stop_criterion.get_nsteps() >= 30:
            print('Early stopping')
            break
        print(optimizer.param_groups[0]['lr'])
        scheduler.step(validation_loss)

    print('Training finished', best_prec1)

    if test:
        test_set = LFPData(data_file=matrix, split='test', standardize=True)
        test_loader = DataLoader(test_set, shuffle=False, batch_size=batch_size,
                                 pin_memory=True, num_workers=1)
        test_loss, test_acc = test_epoch(test_loader, net, criterion, mlog)
        result_writer.update(title, {'Test': {'loss': test_loss, 'accuracy': test_acc}})
        print(test_loss, test_acc)

    # save pngs of visdom plot into log path
    plot_visdom(mlog, log_dir)
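# The EarlyStopping helper used above exposes eval_loss() and get_nsteps(). A minimal
# sketch that counts consecutive epochs without improvement in validation loss; the
# tolerance handling and attribute names are assumptions, not the original class:
class EarlyStopping:
    def __init__(self, min_delta=0.0):
        self.best_loss = float("inf")
        self.min_delta = min_delta
        self.nsteps = 0  # consecutive epochs without improvement

    def eval_loss(self, loss):
        """Record the latest validation loss and update the no-improvement counter."""
        if loss < self.best_loss - self.min_delta:
            self.best_loss = loss
            self.nsteps = 0
        else:
            self.nsteps += 1

    def get_nsteps(self):
        return self.nsteps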
def main(args):
    method = args.method
    pretrainEmbedding = args.pretrainEmbedding
    makeCSVfile = args.makeCSVfile
    if makeCSVfile:
        make_csv_file_from_rawtext()

    tokenize = lambda x: x.split()
    TEXT = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True, pad_first=True)
    LABEL = Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None)
    tv_datafield = [("id", None), ("text", TEXT), ("label", LABEL)]

    train_data = TabularDataset(path='./data/train_log.csv', format='csv',
                                fields=[('text', TEXT), ('label', LABEL)], skip_header=True)
    test_data = TabularDataset(path='./data/test_log.csv', format='csv',
                               fields=[('text', TEXT), ('label', LABEL)], skip_header=True)

    if pretrainEmbedding:
        vectors = Vectors(name="./data/all.review.vec.txt", cache='./')
        TEXT.build_vocab(train_data, max_size=10000, vectors=vectors)
    else:
        TEXT.build_vocab(train_data, max_size=10000)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    vocab_size = len(TEXT.vocab)
    if method == "RNN":
        model = LSTMBaseline(vocab_size)
    else:
        model = CNNBaseline(vocab_size)
    model = model.to(device)

    traindl, testdl = torchtext.data.BucketIterator.splits(
        datasets=(train_data, test_data),   # the train and validation TabularDatasets
        batch_sizes=(32, 32),               # batch size for train and validation
        sort_key=lambda x: len(x.text),     # attribute on which the text should be sorted
        device=device,                      # -1 means CPU; 0 or None means GPU
        sort_within_batch=True,
        repeat=False)

    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.BCEWithLogitsLoss()
    epochs = 100

    trainAccuracy = []
    testAccuracy = []
    trainLoss = []
    testLoss = []
    trainTime = 0
    for epoch in range(1, epochs + 1):
        loss, acc = train_epoch(model, traindl, optimizer, criterion)
        trainLoss.append(loss)
        trainAccuracy.append(acc)
        loss, acc = test_epoch(model, testdl, optimizer, criterion)
        testLoss.append(loss)
        testAccuracy.append(acc)
        print("train Accuracy :", trainAccuracy[-1].item())
        print("test Accuracy :", testAccuracy[-1].item())
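# train_epoch()/test_epoch() above return (loss, accuracy) for a binary task trained with
# BCEWithLogitsLoss. A minimal sketch of the evaluation pass, assuming the model emits one
# logit per example and batch.label holds 0/1 targets; the batch attributes follow the
# torchtext fields above, but this helper itself is an assumption:
import torch


def test_epoch(model, iterator, optimizer, criterion):
    model.eval()
    total_loss, total_acc, n_batches = 0.0, 0.0, 0
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            labels = batch.label.float()
            loss = criterion(logits, labels)
            preds = (torch.sigmoid(logits) > 0.5).float()
            total_loss += loss
            total_acc += (preds == labels).float().mean()
            n_batches += 1
    return total_loss / n_batches, total_acc / n_batches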
optimizer = optim.SGD(model.parameters(), lr=cfg.lr)

# open the log file
log = log_txt(path=cfg.work_dir, description=cfg.exp)

# load pretrained parameters
start_epoch = 0
if cfg.resume_from is not None:
    print('loading pretrained model from %s' % cfg.resume_from)
    checkpoint = torch.load(cfg.resume_from)
    model.load_state_dict(checkpoint['state_dict'], strict=False)
    start_epoch = checkpoint['epoch']

# training and validation data loaders
dataloader = load_data(cfg)
train_loader = dataloader['train']
test_loader = dataloader['test']
train4val_loader = dataloader['train4val']

# configure the training schedule
iter_per_epoch = len(train_loader)
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=cfg.milestone, gamma=0.5)

for i in range(0, cfg.epoch_size):
    train_epoch(i, model, train_loader, criterion, optimizer, cfg, log, train_scheduler)
    test_epoch(model, train4val_loader, test_loader, L1_measure, cfg, log, i)
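# The resume logic above expects a checkpoint dict with 'state_dict' and 'epoch' keys. A
# minimal sketch of a matching save side; the file naming and the place it would be called
# from are assumptions, not part of the original script:
import os
import torch


def save_checkpoint(model, epoch, work_dir):
    """Write a checkpoint in the format that cfg.resume_from is loaded with above."""
    checkpoint = {
        'state_dict': model.state_dict(),
        'epoch': epoch,
    }
    torch.save(checkpoint, os.path.join(work_dir, f'epoch_{epoch}.pth'))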
def main():
    best_prec1 = 0
    test = True
    transfer_learning = True
    batch_size = 50
    sample_length = 3
    num_epochs = 50
    task = 'state_prediction'
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    torch.backends.cudnn.benchmark = True

    root_path = pathlib.Path.home().joinpath('deep_LFP')
    matrix = root_path.joinpath('data', f'cleaned_state_matrix_{sample_length}sec.npz')
    training_dataset = LFPDataStates(data_file=matrix, split='train', standardize=True)
    training_loader = DataLoader(training_dataset, shuffle=True, batch_size=batch_size,
                                 pin_memory=True, num_workers=1)
    validation_set = LFPDataStates(data_file=matrix, split='valid', standardize=True)
    validation_loader = DataLoader(validation_set, shuffle=False, batch_size=batch_size,
                                   pin_memory=True, num_workers=1)

    input_shape = (2, np.int(422 * sample_length))  # this is a hack to figure out shape of fc layer
    net = conv1d_nn.Net(input_shape=input_shape, dropout=0)

    if transfer_learning:
        num_samples_prev_model = np.int(np.round(5000 / sample_length))
        previous_model = f'cleaned_{sample_length}sec_{num_samples_prev_model}_model_best.pth.tar'
        previous_model_weights = os.path.join(root_path, 'checkpoints', previous_model)
        net.load_state_dict(torch.load(previous_model_weights)['state_dict'])
        for param in net.parameters():
            param.requires_grad = False
        num_features = net.fc1.in_features
        net.fc1 = nn.Linear(num_features, 2040)
        net.fc2 = nn.Linear(2040, 4)

    net.cuda()
    criterion = nn.CrossEntropyLoss()
    criterion.cuda()

    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=100, threshold=1e-3)
    stop_criterion = EarlyStopping()

    title = f'cleaned_state_prediction_{sample_length}sec_transfer_learning'
    training_log_path = '/data/eaxfjord/deep_LFP/logs/' + title + '/log'
    log_dir = os.path.dirname(training_log_path)
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    if not os.path.exists(training_log_path):
        open(training_log_path, 'w').close()

    result_writer = ResultsWriter(training_log_path, overwrite=True)
    mlog = MeterLogger(server='localhost', port=8097, nclass=4, title=title,
                       env=f'state_prediction_{sample_length}sec')

    for epoch in range(1, num_epochs + 1):
        mlog.timer.reset()
        train_epoch(training_loader, net, criterion, optimizer, mlog)
        result_writer.update(task, {'Train': mlog.peek_meter()})
        mlog.print_meter(mode="Train", iepoch=epoch)
        mlog.reset_meter(mode="Train", iepoch=epoch)

        validation_loss = val_epoch(validation_loader, net, criterion, mlog)
        prec1 = mlog.meter['accuracy'].value()[0]

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        if is_best:
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                os.path.join(root_path, 'checkpoints', title), {
                    'epoch': epoch + 1,
                    'state_dict': net.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)

        result_writer.update(task, {'Validation': mlog.peek_meter()})
        mlog.print_meter(mode="Test", iepoch=epoch)
        mlog.reset_meter(mode="Test", iepoch=epoch)

        # stop_criterion.eval_loss(validation_loss)
        # if stop_criterion.get_nsteps() >= 30:
        #     print('Early stopping')
        #     break
        print(optimizer.param_groups[0]['lr'])
        scheduler.step(validation_loss)

    print('Training finished', best_prec1)

    if test:
        test_set = LFPDataStates(data_file=matrix, split='test', standardize=True)
        test_loader = DataLoader(test_set, shuffle=False, batch_size=batch_size,
                                 pin_memory=True, num_workers=1)
        test_loss, test_acc = test_epoch(test_loader, net, criterion, mlog)
        result_writer.update(task, {'Test': test_acc})
        print(test_loss, test_acc)

    # when finished, get data from the visdom plot and save it to png
    plot_visdom(mlog, log_dir)
        regularizer))
    # model.eval()
    mse, preds, trues = eval_epoch(valid_loader, model, loss_fun)
    valid_mse.append(mse)
    if valid_mse[-1] < min_mse:
        min_mse = valid_mse[-1]
        best_model = model
        torch.save(best_model, "model.pth")
    end = time.time()
    if (len(train_mse) > 50 and
            np.mean(valid_mse[-5:]) >= np.mean(valid_mse[-10:-5])):
        break
    print(train_mse[-1], valid_mse[-1], round((end - start) / 60, 5))

print(time_range, min_mse)

loss_fun = torch.nn.MSELoss()
best_model = torch.load("model.pth")
test_set = Dataset(test_indices, input_length + time_range - 1, 40, 60, test_direc, True)
test_loader = data.DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=8)
preds, trues, loss_curve = test_epoch(test_loader, best_model, loss_fun)
torch.save({"preds": preds, "trues": trues, "loss_curve": loss_curve}, "results.pt")
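# The "results.pt" bundle saved above can be reloaded for offline analysis. A small usage
# sketch; the summary statistic printed here is just an illustration, not from the
# original script:
import torch

results = torch.load("results.pt")
preds, trues, loss_curve = results["preds"], results["trues"], results["loss_curve"]
print("mean per-step test loss:", sum(loss_curve) / len(loss_curve))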
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s:%(process)d:%(levelname)s:%(name)s:%(message)s')
    parser = argparse.ArgumentParser(description='GMRT CNN Training')
    parser.add_argument('--batch-size', type=int, default=20000, metavar='N',
                        help='input batch size for training (default: 20000)')
    parser.add_argument('--epochs', type=int, default=5, metavar='N',
                        help='number of epochs to train (default: 5)')
    parser.add_argument('--learning-rate', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--keep-probability', type=float, default=0.6, metavar='K',
                        help='Dropout keep probability (default: 0.6)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--num-processes', type=int, default=4, metavar='N',
                        help='how many training processes to use (default: 4)')
    parser.add_argument('--use-gpu', action='store_true', default=False,
                        help='use the GPU if it is available')
    parser.add_argument('--data-path', default='./data',
                        help='the path to the data file')
    parser.add_argument('--data-file', default='data.h5',
                        help='the name of the data file')
    parser.add_argument('--sequence-length', type=int, default=10,
                        help='how many elements in a sequence')
    parser.add_argument('--validation-percentage', type=int, default=10,
                        help='amount of data used for validation')
    parser.add_argument('--training-percentage', type=int, default=80,
                        help='amount of data used for training')
    parser.add_argument('--seed', type=int, default=None, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--learning-rate-decay', type=float, default=0.8, metavar='LRD',
                        help='the initial learning rate decay rate')
    parser.add_argument('--start-learning-rate-decay', type=int, default=5,
                        help='the epoch to start applying the LRD')
    parser.add_argument('--short_run', type=int, default=None,
                        help='use a short run of the test data')
    parser.add_argument('--save', type=str, default=None,
                        help='path to save the final model')
    kwargs = vars(parser.parse_args())
    LOGGER.debug(kwargs)

    # If a seed was specified, use it; otherwise seed randomly
    if kwargs['seed'] is not None:
        np.random.seed(kwargs['seed'])
    else:
        np.random.seed()

    if kwargs['use_gpu'] and torch.cuda.is_available():
        LOGGER.info('Using cuda devices: {}'.format(torch.cuda.device_count()))
        kwargs['cuda_device_count'] = torch.cuda.device_count()
        kwargs['using_gpu'] = True
    else:
        LOGGER.info('Using CPU')
        kwargs['cuda_device_count'] = 0
        kwargs['using_gpu'] = False

    # Do this first so all the data is built before we go parallel and get race conditions
    with Timer('Checking/Building data file'):
        build_data(**kwargs)

    rfi_data = RfiData(**kwargs)

    if kwargs['using_gpu']:
        # DataParallel will distribute the model across all the available GPUs
        # model = nn.DataParallel(GmrtCNN(kwargs['keep_probability'])).cuda()
        model = nn.DataParallel(
            GmrtLinear(kwargs['keep_probability'], kwargs['sequence_length'])).cuda()

        # Train
        train(model, rfi_data, **kwargs)
    else:
        # This uses the HOGWILD! approach to lock-free SGD
        # model = GmrtCNN(kwargs['keep_probability'])
        model = GmrtLinear(kwargs['keep_probability'], kwargs['sequence_length'])
        model.share_memory()  # gradients are allocated lazily, so they are not shared here

        processes = []
        for rank in range(kwargs['num_processes']):
            p = mp.Process(target=train, args=(model, rfi_data, rank), kwargs=kwargs)
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

    with Timer('Reading final test data'):
        test_loader = data.DataLoader(
            rfi_data.get_rfi_dataset('test', short_run_size=kwargs['short_run']),
            batch_size=kwargs['batch_size'],
            num_workers=1,
            pin_memory=kwargs['using_gpu'],
        )

    with Timer('Final test'):
        test_epoch(model, test_loader, kwargs['log_interval'])

    if kwargs['save'] is not None:
        with Timer('Saving model'):
            with open(kwargs['save'], 'wb') as save_file:
                torch.save(model.state_dict(), save_file)
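# In the HOGWILD! branch each worker process calls train(model, rfi_data, rank, **kwargs)
# on the shared-memory model. A minimal sketch of a compatible worker; the per-rank
# dataset lookup, the optimizer choice, and the train_epoch signature are assumptions,
# not the original implementation:
import torch.optim as optim
from torch.utils import data


def train(model, rfi_data, rank=0, **kwargs):
    # Each process trains on its own loader; the shared model accumulates updates lock-free.
    train_loader = data.DataLoader(
        rfi_data.get_rfi_dataset('training', rank=rank),
        batch_size=kwargs['batch_size'],
        num_workers=1,
        pin_memory=kwargs['using_gpu'],
    )
    optimizer = optim.SGD(model.parameters(),
                          lr=kwargs['learning_rate'],
                          momentum=kwargs['momentum'])
    for epoch in range(1, kwargs['epochs'] + 1):
        train_epoch(epoch, model, train_loader, optimizer, kwargs['log_interval'])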