def forward(self, x):  # pylint: disable=W
    '''
    :param x: [batch, features, x, y, z]
    '''
    x = self.bn_in(x.contiguous())
    t = time_logging.start()
    for i, conv in enumerate(self.convolutions):
        x = conv(x)
        t = time_logging.end("block {}".format(i), t)
    x = x.view(x.size(0), x.size(1), -1).max(-1)[0]  # [batch, features]
    x = self.bn_out(x.contiguous())
    return x
def forward(self, inp):  # pylint: disable=W
    '''
    :param inp: [batch, features, x, y, z]
    '''
    x = inp
    t = time_logging.start()
    for i in range(len(self.block_params)):
        # logger.info("%d: %f +- %f", i, x.data.mean(), x.data.std())
        block = getattr(self, 'block{}'.format(i))
        x = block(x)
        t = time_logging.end("block {}".format(i), t)
    x = x.view(x.size(0), x.size(1), -1)  # [batch, features, x*y*z]
    x = x.sum(-1)  # [batch, features]
    return x
def forward(self, inp):  # pylint: disable=W
    '''
    :param inp: [batch, features, x, y, z]
    '''
    x = inp
    t = time_logging.start()
    for i in range(len(self.block_params)):
        block = getattr(self, 'block{}'.format(i))
        x = block(x)
        t = time_logging.end("block {}".format(i), t)
    x = x.view(x.size(0), x.size(1), -1)  # [batch, features, x*y*z]
    x = x.mean(-1)  # [batch, features]
    x = x * self.alpha * 0.1
    inp = inp.view(inp.size(0), inp.size(1), -1).sum(-1)
    y = self.lin(inp)
    return x + y
def train_one_epoch(epoch, model, train_files, optimizer, criterion,
                    number_of_process):
    cnn = model.get_cnn()
    bs = model.get_batch_size(epoch)
    logger = logging.getLogger("trainer")

    indices = list(range(len(train_files)))
    random.shuffle(indices)

    queue = torch.multiprocessing.Queue(maxsize=QUEUE_SIZE)
    event_done = torch.multiprocessing.Event()

    class Batcher(torch.multiprocessing.Process):
        """Worker process that pre-loads batches into the queue."""

        def __init__(self, n=1, i=0):
            super().__init__(daemon=True)
            self.n = n
            self.i = i

        def run(self):
            s = 0
            for i in range(0, len(train_files), bs):
                if s % self.n == self.i:
                    j = min(i + bs, len(train_files))
                    gc.collect()
                    x, y = model.load_train_files(
                        [train_files[g] for g in indices[i:j]])
                    queue.put((x, y))
                # every worker counts every batch so that the round-robin
                # split across workers stays consistent
                s += 1
            event_done.wait()

    for i in range(number_of_process):
        batcher = Batcher(number_of_process, i)
        batcher.start()

    losses = []

    cnn.train()
    if torch.cuda.is_available():
        cnn.cuda()

    for i in range(0, len(train_files), bs):
        t0 = perf_counter()
        gc.collect()
        t = time_logging.start()

        x, y = queue.get()
        x = torch.FloatTensor(x)
        y = torch.FloatTensor(y)
        x = torch.autograd.Variable(x)
        y = torch.autograd.Variable(y)
        if torch.cuda.is_available():
            x = x.cuda()
            y = y.cuda()
        t = time_logging.end("batch", t)

        optimizer.zero_grad()
        outputs = cnn(x)
        loss = criterion(outputs, y)
        t = time_logging.end("forward", t)

        loss.backward()
        optimizer.step()
        t = time_logging.end("backward", t)

        loss_ = float(loss.data.cpu().numpy())
        losses.append(loss_)

        logger.info(
            "[%d.%.2d|%d/%d] RMSE=%.1e <RMSE>=%.1e Queue=%d Memory=%s Time=%.2fs",
            epoch, 100 * i // len(train_files), i, len(train_files),
            loss_**0.5, np.mean(losses)**0.5, queue.qsize(),
            gpu_memory.format_memory(gpu_memory.used_memory()),
            perf_counter() - t0)

        del x
        del y
        del outputs
        del loss

    event_done.set()
    return np.mean(losses)
def train(args):
    if os.path.isdir(args.log_dir):
        print("{} exists already".format(args.log_dir))
        return

    os.mkdir(args.log_dir)

    logger = logging.getLogger("trainer")
    logger.setLevel(logging.DEBUG)
    logger.handlers = []
    ch = logging.StreamHandler()
    logger.addHandler(ch)
    fh = logging.FileHandler(os.path.join(args.log_dir, "log.txt"))
    logger.addHandler(fh)

    logger.info("Arguments = %s", repr(args))

    ############################################################################
    # Files and labels
    train_data = None
    eval_datas = []

    if args.train_data_path is not None:
        train_data = load_data(args.train_data_path)
        logger.info("%d training files", len(train_data.files))

    if args.eval_data_path is not None:
        for pattern in args.eval_data_path:
            eval_data = load_data(pattern)
            eval_datas.append(eval_data)
            logger.info("%d evaluation files", len(eval_data.files))

    ############################################################################
    # Import model
    model_path = shutil.copy2(args.model_path,
                              os.path.join(args.log_dir, "model.py"))
    module = import_module(model_path)
    model = module.MyModel()
    model.initialize()
    cnn = model.get_cnn()

    logger.info("There are %d parameters to optimize",
                sum(x.numel() for x in cnn.parameters()))

    if args.restore_path is not None:
        restore_path = shutil.copy2(
            os.path.join(args.restore_path, "model.pkl"),
            os.path.join(args.log_dir, "model.pkl"))
        checkpoint = torch.load(restore_path)
        args.start_epoch = checkpoint['epoch']
        cnn.load_state_dict(checkpoint['state_dict'])
        logger.info("Restoration from file %s",
                    os.path.join(args.restore_path, "model.pkl"))

    ############################################################################
    # Only evaluation
    if train_data is None:
        if args.restore_path is None:
            logger.info("Evaluation with randomly initialized parameters")

        for i, data in enumerate(eval_datas):
            outputs, targets = evaluate(
                model, data.files, number_of_process=args.number_of_process)
            save_evaluation(data.ids, outputs, args.log_dir, i)
            rmse = np.mean((outputs - targets)**2)**0.5
            logger.info("Evaluation RMSE = %f", rmse)
        return

    ############################################################################
    # Optimizer
    optimizer = model.get_optimizer()
    criterion = model.get_criterion()
    if torch.cuda.is_available():
        criterion.cuda()

    if args.restore_path is not None:
        checkpoint = torch.load(os.path.join(args.restore_path, "model.pkl"))
        optimizer.load_state_dict(checkpoint['optimizer'])

    ############################################################################
    # Training
    statistics_train = []
    statistics_eval = [[] for _ in eval_datas]

    # interactive shell for manual inspection before training starts
    IPython.embed()

    for epoch in range(args.start_epoch, args.number_of_epochs):
        time_logging.clear()
        t = time_logging.start()

        lr = model.get_learning_rate(epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        avg_loss = train_one_epoch(epoch, model, train_data.files, optimizer,
                                   criterion, args.number_of_process)
        statistics_train.append([epoch, avg_loss])
        model.training_done(avg_loss)

        time_logging.end("training epoch", t)
        logger.info("%s", time_logging.text_statistics())

        cnn.cpu()
        path = os.path.join(args.log_dir, 'model.pkl')
        torch.save({
            'epoch': epoch + 1,
            'state_dict': cnn.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, path)
        logger.info("Saved in %s", path)

        if epoch % args.eval_each == args.eval_each - 1:
            for i, (data, stat) in enumerate(zip(eval_datas, statistics_eval)):
                outputs, targets = evaluate(
                    model, data.files, epoch,
                    number_of_process=args.number_of_process)
                save_evaluation(data.ids, outputs, args.log_dir, i)
                rmse = np.mean((outputs - targets)**2)**0.5
                logger.info("Evaluation RMSE = %f", rmse)
                stat.append([epoch, rmse])

    statistics_train = np.array(statistics_train)
    np.save(os.path.join(args.log_dir, "statistics_train.npy"),
            statistics_train)
    statistics_eval = np.array(statistics_eval)
    np.save(os.path.join(args.log_dir, "statistics_eval.npy"),
            statistics_eval)
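# The trainer above only talks to the user-supplied model file through a small
# interface (module.MyModel).  The skeleton below is a sketch of that interface
# as inferred from the calls made in train() / train_one_epoch() above; the
# class name is hypothetical, the actual model.py may define more methods, and
# every body here is intentionally left unimplemented.
class MyModelInterfaceSketch:
    def initialize(self):
        """Build the network and any internal state."""
        raise NotImplementedError

    def get_cnn(self):
        """Return the torch.nn.Module that is trained and checkpointed."""
        raise NotImplementedError

    def get_batch_size(self, epoch):
        """Batch size to use for this epoch (may vary across epochs)."""
        raise NotImplementedError

    def load_train_files(self, files):
        """Load a list of training files; return (x, y) arrays."""
        raise NotImplementedError

    def get_optimizer(self):
        """Return the torch optimizer over get_cnn().parameters()."""
        raise NotImplementedError

    def get_criterion(self):
        """Return the loss module used for training and evaluation."""
        raise NotImplementedError

    def get_learning_rate(self, epoch):
        """Learning rate applied to every parameter group at this epoch."""
        raise NotImplementedError

    def training_done(self, avg_loss):
        """Hook called after each training epoch with the mean loss."""
        raise NotImplementedError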
def train_one_epoch(epoch, model, train_files, train_labels, optimizer,
                    criterion, number_of_process, queue_size):
    cnn = model.get_cnn()
    logger = logging.getLogger("trainer")

    # list of lists [first batch, second batch, ...]
    batches = model.create_train_batches(epoch, train_files, train_labels)

    queue = torch.multiprocessing.Queue(maxsize=queue_size)
    event_done = torch.multiprocessing.Event()

    class Batcher(torch.multiprocessing.Process):
        def __init__(self, n=1, i=0):
            super().__init__(daemon=True)
            self.n = n
            self.i = i

        def run(self):
            for s, batch in enumerate(batches):
                if s % self.n == self.i:
                    gc.collect()
                    x, y = model.load_train_batch(batch)
                    queue.put((x, y))
            event_done.wait()

    for i in range(number_of_process):
        batcher = Batcher(number_of_process, i)
        batcher.start()

    losses = []
    total_correct = 0
    total_trained = 0

    cnn.train()
    if torch.cuda.is_available():
        cnn.cuda()

    for s, batch in enumerate(batches):
        t0 = perf_counter()
        gc.collect()
        t = time_logging.start()

        x, y = queue.get()
        x = torch.autograd.Variable(x)
        y = torch.autograd.Variable(y)
        t = time_logging.end("load batch", t)

        if torch.cuda.is_available():
            x = x.cuda()
            y = y.cuda()
        t = time_logging.end("upload batch", t)

        optimizer.zero_grad()
        outputs = cnn(x)
        loss = criterion(outputs, y)
        t = time_logging.end("forward", t)

        loss.backward()
        optimizer.step()
        t = time_logging.end("backward", t)

        loss_ = float(loss.data.cpu().numpy())
        losses.append(loss_)

        if outputs.size(-1) > 1:
            if y.dim() == 1:
                correct = sum(outputs.data.cpu().numpy().argmax(-1) ==
                              y.data.cpu().numpy())
            else:
                correct = sum(outputs.data.cpu().numpy().argmax(-1) ==
                              y.data.cpu().numpy().argmax(-1))
        else:
            correct = np.sum(
                np.sign(outputs.data.cpu().numpy().reshape((-1, ))) ==
                2 * y.data.cpu().numpy() - 1)
        total_correct += correct
        total_trained += len(batch)

        logger.info(
            "[%d.%.2d|%d/%d] Loss=%.1e <Loss>=%.1e Accuracy=%d/%d "
            "<Accuracy>=%.2f%% Queue=%d Memory=%s Time=%.2fs",
            epoch, 100 * s // len(batches), s, len(batches), loss_,
            np.mean(losses), correct, len(batch),
            100 * total_correct / total_trained, queue.qsize(),
            gpu_memory.format_memory(gpu_memory.used_memory()),
            perf_counter() - t0)

        del x
        del y
        del outputs
        del loss

    event_done.set()
    return np.mean(losses), total_correct / total_trained
def train(args):
    if os.path.isdir(args.log_dir):
        print("{} exists already".format(args.log_dir))
        return

    os.mkdir(args.log_dir)

    logger = logging.getLogger("trainer")
    logger.setLevel(logging.DEBUG)
    logger.handlers = []
    ch = logging.StreamHandler()
    logger.addHandler(ch)
    fh = logging.FileHandler(os.path.join(args.log_dir, "log.txt"))
    logger.addHandler(fh)

    logger.info("Arguments = %s", repr(args))

    ############################################################################
    # Files and labels
    classes = None
    train_data = None
    eval_datas = []

    if args.train_csv_path is not None or args.train_data_path is not None:
        train_data, classes = load_data_with_csv(args.train_csv_path,
                                                 args.train_data_path, classes)
        logger.info(
            "%s=%d training files", "+".join([
                str(train_data.labels.count(x)) for x in set(train_data.labels)
            ]), len(train_data.files))

    if args.eval_data_path is not None and args.eval_csv_path is not None:
        assert len(args.eval_data_path) == len(args.eval_csv_path)
        for csv_file, pattern in zip(args.eval_csv_path, args.eval_data_path):
            eval_data, classes = load_data_with_csv(csv_file, pattern, classes)
            eval_datas.append(eval_data)
            logger.info(
                "%s=%d evaluation files", "+".join([
                    str(eval_data.labels.count(x))
                    for x in set(eval_data.labels)
                ]), len(eval_data.files))
    elif args.eval_data_path is not None and args.eval_csv_path is None:
        for pattern in args.eval_data_path:
            eval_data = load_data(pattern)
            eval_datas.append(eval_data)
            logger.info("%d evaluation files", len(eval_data.files))
    elif args.eval_data_path is None and args.eval_csv_path is None:
        pass
    else:
        raise AssertionError("eval_data_path or eval_csv_path missing ?")

    if args.number_of_classes is not None and classes is None:
        classes = list(range(args.number_of_classes))

    ############################################################################
    # Import model
    model_path = shutil.copy2(args.model_path,
                              os.path.join(args.log_dir, "model.py"))
    module = import_module(model_path)
    model = module.MyModel()
    model.initialize(number_of_classes=len(classes))
    cnn = model.get_cnn()

    logger.info("There are %d parameters to optimize",
                sum(x.numel() for x in cnn.parameters()))

    if args.restore_path is not None:
        restore_path = shutil.copy2(
            os.path.join(args.restore_path, "model.pkl"),
            os.path.join(args.log_dir, "model.pkl"))
        checkpoint = torch.load(restore_path)
        args.start_epoch = checkpoint['epoch']
        cnn.load_state_dict(checkpoint['state_dict'])
        logger.info("Restoration from file %s",
                    os.path.join(args.restore_path, "model.pkl"))

    ############################################################################
    # Only evaluation
    if train_data is None:
        if args.restore_path is None:
            logger.info("Evaluation with randomly initialized parameters")

        for i, data in enumerate(eval_datas):
            time_logging.clear()
            t = time_logging.start()
            outputs = evaluate(model, data.files, -1, args.number_of_process,
                               args.queue_size)
            time_logging.end("evaluation", t)
            logger.info("%s", time_logging.text_statistics())

            save_evaluation(data.ids, outputs, data.labels, args.log_dir, i)

            if data.labels is not None:
                if outputs.shape[-1] > 1:
                    correct = np.sum(
                        np.argmax(outputs, axis=1) == np.array(
                            data.labels, np.int64))
                else:
                    correct = np.sum(
                        np.sign(outputs).reshape((-1, )) ==
                        2 * np.array(data.labels, np.int64) - 1)
                logger.info("%d / %d = %.2f%%", correct, len(data.labels),
                            100 * correct / len(data.labels))
        return

    ############################################################################
    # Optimizer
    optimizer = model.get_optimizer()
    criterion = model.get_criterion()
    train_criterion = model.get_train_criterion()
    if torch.cuda.is_available():
        criterion.cuda()

    if args.restore_path is not None:
        checkpoint = torch.load(os.path.join(args.restore_path, "model.pkl"))
        optimizer.load_state_dict(checkpoint['optimizer'])

    ############################################################################
    # Training
    statistics_train = []
    statistics_eval = [[] for _ in eval_datas]

    if args.restore_path is not None:
        statistics_train = list(
            np.load(os.path.join(args.restore_path, "statistics_train.npy")))
        statistics_eval = [
            list(x) for x in np.load(
                os.path.join(args.restore_path, "statistics_eval.npy"))
        ]
        logger.info("Restoration from numpy statistics files")

    if args.number_of_epochs is not None:
        number_of_epochs = args.number_of_epochs
    else:
        number_of_epochs = model.number_of_epochs()

    # interactive shell for manual inspection before training starts
    IPython.embed()

    for epoch in range(args.start_epoch, number_of_epochs):
        time_logging.clear()
        t = time_logging.start()

        lr = model.get_learning_rate(epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        avg_loss, accuracy = train_one_epoch(
            epoch, model, train_data.files, train_data.labels, optimizer,
            train_criterion, args.number_of_process, args.queue_size)
        statistics_train.append([epoch, avg_loss, accuracy])
        model.training_done(avg_loss)

        time_logging.end("training epoch", t)
        logger.info("%s", time_logging.text_statistics())

        cnn.cpu()
        path = os.path.join(args.log_dir, 'model.pkl')
        torch.save({
            'epoch': epoch + 1,
            'state_dict': cnn.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, path)
        logger.info("Saved in %s", path)

        if epoch % args.eval_each == args.eval_each - 1:
            for i, (data, stat) in enumerate(zip(eval_datas, statistics_eval)):
                time_logging.clear()
                t = time_logging.start()
                outputs = evaluate(model, data.files, epoch,
                                   args.number_of_process, args.queue_size)
                time_logging.end("evaluation", t)
                logger.info("%s", time_logging.text_statistics())

                save_evaluation(data.ids, outputs, data.labels, args.log_dir,
                                i)

                if data.labels is not None:
                    if outputs.shape[-1] > 1:
                        correct = np.sum(
                            np.argmax(outputs, axis=1) == np.array(
                                data.labels, np.int64))
                    else:
                        correct = np.sum(
                            np.sign(outputs).reshape((-1, )) ==
                            2 * np.array(data.labels, np.int64) - 1)

                    criterion.cpu()
                    loss = criterion(
                        torch.autograd.Variable(torch.FloatTensor(outputs)),
                        torch.autograd.Variable(
                            torch.LongTensor(data.labels))).data[0]
                    if torch.cuda.is_available():
                        criterion.cuda()

                    logger.info(
                        "Evaluation accuracy %d / %d = %.2f%%, Loss = %.1e",
                        correct, len(data.labels),
                        100 * correct / len(data.labels), loss)
                    stat.append([epoch, loss, correct / len(data.labels)])

        np.save(os.path.join(args.log_dir, "statistics_train.npy"),
                np.array(statistics_train))
        np.save(os.path.join(args.log_dir, "statistics_eval.npy"),
                np.array(statistics_eval))
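# train(args) above only reads a fixed set of attributes from `args`.  The
# parser below is a sketch of a command line that would provide them: only the
# attribute names come from the code above; the flag spellings, types and
# defaults are assumptions made for illustration, not the project's actual CLI.
def make_arg_parser_sketch():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--log_dir", required=True)
    parser.add_argument("--model_path", required=True)
    parser.add_argument("--train_csv_path", default=None)
    parser.add_argument("--train_data_path", default=None)
    parser.add_argument("--eval_csv_path", nargs="*", default=None)
    parser.add_argument("--eval_data_path", nargs="*", default=None)
    parser.add_argument("--number_of_classes", type=int, default=None)
    parser.add_argument("--restore_path", default=None)
    parser.add_argument("--start_epoch", type=int, default=0)
    parser.add_argument("--number_of_epochs", type=int, default=None)
    parser.add_argument("--eval_each", type=int, default=1)
    parser.add_argument("--number_of_process", type=int, default=1)
    parser.add_argument("--queue_size", type=int, default=1)
    return parser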
def evaluate(model, files, epoch, number_of_process, queue_size):
    cnn = model.get_cnn()
    bs = model.get_batch_size(epoch)
    logger = logging.getLogger("trainer")

    queue = torch.multiprocessing.Queue(maxsize=queue_size)
    event_done = torch.multiprocessing.Event()

    class Batcher(torch.multiprocessing.Process):
        """Worker process that pre-loads evaluation batches into the queue."""

        def __init__(self, n=1, i=0):
            super().__init__(daemon=True)
            self.n = n
            self.i = i

        def run(self):
            s = 0
            for i in range(0, len(files), bs):
                if s % self.n == self.i:
                    j = min(i + bs, len(files))
                    gc.collect()
                    x = model.load_eval_files(files[i:j])
                    queue.put((s, x))
                # every worker counts every batch so that the batch index s
                # stays consistent across the round-robin split
                s += 1
            event_done.wait()

    for i in range(number_of_process):
        batcher = Batcher(number_of_process, i)
        batcher.start()

    cnn.eval()
    if torch.cuda.is_available():
        cnn.cuda()

    all_outputs = [None] * len(range(0, len(files), bs))

    for i in range(0, len(files), bs):
        t = time_logging.start()
        gc.collect()

        s, x = queue.get()
        t = time_logging.end("load batch", t)

        if torch.cuda.is_available():
            x = x.cuda()
        t = time_logging.end("upload batch", t)

        outputs = model.evaluate(x)
        t = time_logging.end("forward", t)

        # batches can arrive out of order, so store them by batch index
        all_outputs[s] = outputs

        logger.info("Evaluation [%d.%.2d|%d/%d] Memory=%s Queue=%d", epoch,
                    100 * i // len(files), i, len(files),
                    gpu_memory.format_memory(gpu_memory.used_memory()),
                    queue.qsize())

        del s
        del x
        del outputs

    event_done.set()
    return np.concatenate(all_outputs, axis=0)
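# Same idea as the interface sketch after the regression trainer: the
# classification train(), train_one_epoch() and evaluate() above use MyModel
# only through the methods listed below.  The skeleton is inferred from those
# calls and is illustrative only (the class name is hypothetical); the real
# model.py defines the semantics of each method.
class MyClassificationModelInterfaceSketch:
    def initialize(self, number_of_classes):
        raise NotImplementedError

    def get_cnn(self):
        """torch.nn.Module that is trained, checkpointed and moved to GPU."""
        raise NotImplementedError

    def get_batch_size(self, epoch):
        raise NotImplementedError

    def create_train_batches(self, epoch, files, labels):
        """Return a list of batches (each batch is a list of samples)."""
        raise NotImplementedError

    def load_train_batch(self, batch):
        """Return (x, y) tensors for one training batch."""
        raise NotImplementedError

    def load_eval_files(self, files):
        """Return an input tensor for a chunk of evaluation files."""
        raise NotImplementedError

    def evaluate(self, x):
        """Forward pass used at evaluation time; returns network outputs."""
        raise NotImplementedError

    def get_optimizer(self):
        raise NotImplementedError

    def get_criterion(self):
        raise NotImplementedError

    def get_train_criterion(self):
        raise NotImplementedError

    def get_learning_rate(self, epoch):
        raise NotImplementedError

    def number_of_epochs(self):
        raise NotImplementedError

    def training_done(self, avg_loss):
        raise NotImplementedError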