class Summary(object):
    """Small convenience wrapper around a ``SummaryWriter`` for one run.

    The writer is created with ``log/<summary_name>`` as its first argument
    and the same path is reused for the JSON export in :meth:`summaryEnd`.
    """

    def __init__(self, summary_name):
        """Create the underlying writer.

        :param summary_name: run name; joined onto ``"log"`` to build the
            path handed to ``SummaryWriter``.
        """
        self._logPath = os.path.join("log", summary_name)
        self._writer = SummaryWriter(self._logPath)

    def addTrainLoss(self, loss, epoch):
        """Log ``loss`` under the tag ``'train loss'`` at step ``epoch``."""
        self._writer.add_scalar('train loss', loss, epoch)

    def addValLoss(self, loss, epoch):
        """Log ``loss`` under the tag ``'val loss'`` at step ``epoch``."""
        self._writer.add_scalar('val loss', loss, epoch)

    def addLearningRate(self, lr, epoch):
        """Log ``lr`` under the tag ``'learning rate'`` at step ``epoch``."""
        self._writer.add_scalar('learning rate', lr, epoch)

    def summaryEnd(self):
        """Export all scalars to ``all_scalars.json`` and close the writer.

        The writer is closed even if the export raises, so the underlying
        writer is never left open; the export error still propagates.
        """
        try:
            self._writer.export_scalars_to_json(
                os.path.join(self._logPath, "all_scalars.json"))
        finally:
            self._writer.close()

    def addPR_label_pred(self, label, prediction):
        """Log a precision-recall curve tagged ``'PR_curve'``.

        :param label: ground-truth labels, forwarded unchanged.
        :param prediction: predicted scores, forwarded unchanged.
        """
        self._writer.add_pr_curve('PR_curve', label, prediction,
                                  num_thresholds=1000)
def add_pr_curve(self, tag, labels, predictions, global_step=None,
                 num_thresholds=127, weights=None, walltime=None):
    """Log a precision-recall curve, but only when writing is enabled.

    When ``self.is_write`` is falsy nothing happens. Otherwise every
    argument is handed positionally, in the order received, to
    ``SummaryWriter.add_pr_curve``. The method always returns ``None``.
    """
    if not self.is_write:
        return
    forwarded = (tag, labels, predictions, global_step, num_thresholds,
                 weights, walltime)
    SummaryWriter.add_pr_curve(self, *forwarded)
def tb_train2():
    """Exercise most ``SummaryWriter`` logging calls with synthetic data.

    Runs 100 iterations. Scalars are logged every iteration; an image grid,
    a two-second tone, a text entry, the ResNet-18 parameter histograms and
    a random PR curve are logged every tenth iteration. Afterwards the first
    100 MNIST test images are logged as an embedding, all scalars are
    exported to ``./all_scalars.json`` and the writer is closed.

    Side effects: downloads MNIST into ``mnist`` if it is missing, and
    writes event files through a default-constructed ``SummaryWriter``.
    """
    import torchvision.utils as vutils
    import torchvision.models as models
    from torchvision import datasets

    resnet18 = models.resnet18(False)
    writer = SummaryWriter()
    sample_rate = 44100
    freqs = [262, 294, 330, 349, 392, 440, 440, 440, 440, 440, 440]
    # Index of every sample in the two-second clip; shared by all tones.
    sample_index = torch.arange(sample_rate * 2, dtype=torch.float32)

    for n_iter in range(100):
        dummy_s1 = torch.rand(1)
        dummy_s2 = torch.rand(1)
        # data grouping by `slash`
        writer.add_scalar('data/scalar1', dummy_s1[0], n_iter)
        writer.add_scalar('data/scalar2', dummy_s2[0], n_iter)

        writer.add_scalars('data/scalar_group',
                           {'xsinx': n_iter * np.sin(n_iter),
                            'xcosx': n_iter * np.cos(n_iter),
                            'arctanx': np.arctan(n_iter)}, n_iter)

        dummy_img = torch.rand(32, 3, 64, 64)  # output from network
        if n_iter % 10 == 0:
            x = vutils.make_grid(dummy_img, normalize=True, scale_each=True)
            writer.add_image('Image', x, n_iter)

            # The previous loop ran over ``x.size(0)`` (the channel count of
            # the image grid), so only the first three samples were written
            # and the clip was effectively silent. Fill the whole clip.
            # Amplitude of sound should be in [-1, 1]; cosine guarantees it.
            dummy_audio = torch.cos(
                freqs[n_iter // 10] * np.pi * sample_index / sample_rate)
            writer.add_audio('myAudio', dummy_audio, n_iter,
                             sample_rate=sample_rate)

            writer.add_text('Text', 'text logged at step:' + str(n_iter),
                            n_iter)

            for name, param in resnet18.named_parameters():
                writer.add_histogram(name, param.clone().cpu().data.numpy(),
                                     n_iter)

            # needs tensorboard 0.4RC or later
            writer.add_pr_curve('xoxo', np.random.randint(2, size=100),
                                np.random.rand(100), n_iter)

    dataset = datasets.MNIST('mnist', train=False, download=True)
    images = dataset.test_data[:100].float()
    label = dataset.test_labels[:100]

    features = images.view(100, 784)
    writer.add_embedding(features, metadata=label,
                         label_img=images.unsqueeze(1))

    # export scalar data to JSON for external processing
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
def compute_and_plot(self, tb_writer: tfx.SummaryWriter = None, itr=0):
    """Compute ROC and precision-recall curves from ``self.gt``/``self.pred``.

    When ``tb_writer`` is truthy the curves are also logged:

    * each ROC point as scalar ``'ROC'`` in group ``'Graphs'``, with
      TPR * 100 as the value and FPR * 100 as the step;
    * a native PR curve tagged ``'pr_curve'`` at step ``itr``;
    * each PR point as scalar ``'PR-Curve'`` in group ``'Graphs'``, with
      precision * 100 as the value and recall * 100 as the step.

    The writer is flushed after each group of points.

    :return: ``(precision, recall, thresholds)`` as returned by
        ``precision_recall_curve``.
    """
    truth = np.array(self.gt)
    scores = np.array(self.pred)

    def _log_points(series, values, steps):
        # One scalar per curve point; scaling by 100 turns both axes into percent.
        for value, step in zip(values, steps):
            tb_writer.add_scalars('Graphs', {series: value * 100}, step * 100)
        tb_writer.flush()

    fpr, tpr, th = roc_curve(truth, scores)
    if tb_writer:
        _log_points('ROC', tpr, fpr)

    p, r, th = precision_recall_curve(truth, scores)
    if tb_writer:
        tb_writer.add_pr_curve('pr_curve', truth, scores, itr)
        _log_points('PR-Curve', p, r)

    return p, r, th
class Experiment:
    """Build, train and log one experiment described by ``config``.

    Constructing an instance immediately loads the dataset, data loaders,
    model and summary writer (:meth:`load`) and then persists the
    configuration and model descriptions (:meth:`save`).
    """

    def __init__(self, config):
        """Store ``config`` and run :meth:`load` followed by :meth:`save`."""
        self.config = config
        self.load()
        self.save()

    def load(self):
        """Create dataset, train/valid loaders, model and summary writer."""
        # MNISTDataset, IndicatorDataset, LoadDataset
        self.dataset = dataset_by_name(self.config.DATASET_NAME)(
            config=self.config)
        self.train_dataloader = DataLoader(
            self.dataset.train_dataset,
            batch_size=self.config.TRAIN_BATCH_SIZE,
            shuffle=self.config.TRAIN_SHUFFLE,
            drop_last=True)
        self.valid_dataloader = DataLoader(
            self.dataset.valid_dataset,
            batch_size=self.config.VALID_BATCH_SIZE,
            shuffle=self.config.VALID_SHUFFLE,
            drop_last=True)
        MODEL = class_by_name(self.config.MODEL_NAME)  # CNN, LSTM
        self.model = MODEL(config=self.config).to(self.config.DEVICE)
        self.writer = SummaryWriter(
            log_dir=os.path.join(self.config.EXPERIMENT_DIR, 'summary'))

    def save(self):
        """Persist the config and the ONNX/text dumps of the model."""
        self.config.save()
        self.model.to_onnx(directory=self.config.EXPERIMENT_DIR)
        self.model.to_txt(directory=self.config.EXPERIMENT_DIR)

    def run_epoch(self, epoch):
        """Train and validate for one epoch, then log the results.

        :param epoch: epoch index, used as the global step for every entry.
        """
        # Fit the model
        training_loss = self.model.fit(
            dataloader=self.train_dataloader).item()
        # Validate validation set
        validation_loss = self.model.validate(
            dataloader=self.valid_dataloader).item()

        # Predict on a random sample; index 1 of ``max`` is the arg-max.
        images, labels = self.dataset.random_sample(n=16)
        prediction_logprob = self.model.predict(xs=images)[0].cpu().detach()
        predicted_labels = prediction_logprob.max(
            1, keepdim=True)[1].numpy().flatten()

        # Write losses to the tensorboard
        self.writer.add_scalar('training_loss', training_loss, epoch)
        self.writer.add_scalar('validation_loss', validation_loss, epoch)

        # Write random image to the summary writer.
        image_grid = torchvision.utils.make_grid(images,
                                                 normalize=True,
                                                 scale_each=True)
        self.writer.add_image(tag="RandomSample y-{} yhat{}".format(
            '.'.join(map(str, labels)),
            '.'.join(map(str, predicted_labels))),
                              img_tensor=image_grid,
                              global_step=epoch)

        # Write PR Curve to the summary writer.
        # NOTE(review): this logs random labels/scores, not the model's
        # predictions; it is a placeholder that only exercises the API.
        # needs tensorboard 0.4RC or later
        self.writer.add_pr_curve('xoxo', np.random.randint(2, size=100),
                                 np.random.rand(100), epoch)

    def _export_scalars(self):
        """Export all logged scalars to ``<EXPERIMENT_DIR>/all_scalars.json``."""
        # ``export_scalars_to_json`` expects a file path; the experiment
        # directory itself used to be passed, which cannot be written to.
        self.writer.export_scalars_to_json(
            os.path.join(self.config.EXPERIMENT_DIR, 'all_scalars.json'))

    def run(self):
        """Run ``config.EPOCH_SIZE`` epochs, then export the scalars."""
        with trange(0, self.config.EPOCH_SIZE) as progress:
            for epoch in progress:
                self.run_epoch(epoch=epoch)
        self._export_scalars()
class FSLDiscriminatorAgent(BaseAgent):
    """Agent that fine-tunes and evaluates an ``EncoderModel`` classifier.

    In training mode the source-domain weights are loaded, the first child
    module is frozen and the model is trained on the target-domain loader
    with a checkpoint saved after every epoch. In test mode a tuned model
    is loaded and evaluated at one or several decision thresholds, with the
    metrics appended to a CSV file.
    """

    def __init__(self, config):
        """Build models, loss, optimizer, counters and the summary writer.

        :param config: project configuration object; the attributes read
            here are ``cuda``, ``seed``, ``gpu_device``, ``image_size``,
            ``learning_rate``, ``momentum``, ``batch_size`` and
            ``summary_dir``.
        """
        super().__init__(config)
        print(torch.__version__)

        # define models
        self.gen_model = GenerativeFSL_CAEModel()
        self.model = EncoderModel(self.config)

        # define loss
        self.loss = nn.MSELoss()  # nn.NLLLoss()

        # set cuda flag
        self.is_cuda = torch.cuda.is_available()
        if self.is_cuda and not self.config.cuda:
            self.logger.info(
                "WARNING: You have a CUDA device, so you should probably enable CUDA"
            )

        self.cuda = self.is_cuda & self.config.cuda

        # Seed the CPU-side generators on BOTH paths. They were previously
        # seeded only when running on CPU, so GPU runs were not repeatable
        # for anything drawn from them.
        self.manual_seed = self.config.seed
        torch.manual_seed(self.manual_seed)
        random.seed(self.manual_seed)
        np.random.seed(self.manual_seed)

        if self.cuda:
            torch.cuda.manual_seed(self.manual_seed)
            torch.cuda.manual_seed_all(self.manual_seed)
            self.device = torch.device("cuda")
            torch.cuda.set_device(self.config.gpu_device)
            self.model = self.model.to(self.device)
            self.loss = self.loss.to(self.device)
            self.logger.info("Program will run on *****GPU-CUDA***** ")
            print_cuda_statistics()
        else:
            self.device = torch.device("cpu")
            torch.cuda.manual_seed_all(self.manual_seed)
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
            self.logger.info("Program will run on *****CPU*****\n")

        summary(self.model,
                input_size=(3, self.config.image_size,
                            self.config.image_size))

        # define optimizer
        self.optimizer = optim.RMSprop(self.model.parameters(),
                                       alpha=0.99,
                                       lr=self.config.learning_rate,
                                       eps=1e-08,
                                       weight_decay=0,
                                       momentum=self.config.momentum)

        # initialize counter
        self.current_epoch = 0
        self.current_iteration = 0
        self.best_metric = 0
        self.best_train_loss = 999999999999
        self.fixed_noise = Variable(
            torch.randn(self.config.batch_size, 3, self.config.image_size,
                        self.config.image_size))

        # Summary Writer
        self.summary_writer = SummaryWriter(
            log_dir=self.config.summary_dir,
            comment='GenerativeFSL Covid Prediction')

    def _checkpoint_name(self, domain_name):
        """Return the per-domain checkpoint file name used to save and resume."""
        return domain_name + '_' + self.config.checkpoint_file

    def save_checkpoint(self,
                        filename='discriminator_checkpoint.pth.tar',
                        is_best=False):
        """Save the latest checkpoint of the training.

        The state is written to
        ``checkpoint_dir + <target_domain>_<checkpoint_file>``, copied to a
        per-epoch file prefixed with the epoch number, and, when
        ``is_best`` is true, copied again with a ``BestModel_<epoch>``
        prefix.

        :param filename: accepted for interface compatibility; the file
            name actually used is derived from the configuration.
        :param is_best: whether this is the best model so far.
        """
        domain_checkpoint_file = self._checkpoint_name(
            self.config.target_domain)
        state = {
            'epoch': self.current_epoch,
            'iteration': self.current_iteration,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
        }
        self.logger.info(
            "Checkpoint saving from '{}' at (epoch {}) at (iteration {})\n".
            format(self.config.checkpoint_dir, state['epoch'],
                   state['iteration']))

        # Save the state
        latest_path = self.config.checkpoint_dir + domain_checkpoint_file
        torch.save(state, latest_path)
        shutil.copyfile(
            latest_path, self.config.checkpoint_dir + str(state['epoch']) +
            domain_checkpoint_file)
        # If it is the best, copy it to a separate 'BestModel_' file
        if is_best:
            shutil.copyfile(
                latest_path, self.config.checkpoint_dir + 'BestModel_' +
                str(state['epoch']) + domain_checkpoint_file)

    def load_checkpoint(self, filename):
        """Resume model, optimizer and counters from a checkpoint if present.

        A missing file (``OSError``) is logged and treated as a first run.

        :param filename: file name relative to ``config.checkpoint_dir``.
        """
        filename = self.config.checkpoint_dir + filename
        try:
            self.logger.info(
                "******Loading checkpoint '{}' from dir {}".format(
                    filename, self.config.checkpoint_dir))
            # map_location lets a checkpoint written on another device load.
            checkpoint = torch.load(filename, map_location=self.device)
            self.logger.info("********Loaded checkpoint '{}'".format(filename))

            self.current_epoch = checkpoint['epoch'] + 1
            self.current_iteration = checkpoint['iteration']
            self.model.load_state_dict(checkpoint['state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])

            self.logger.info(
                "Checkpoint loaded successfully from '{}' at (epoch {}) at (iteration {})\n"
                .format(self.config.checkpoint_dir, checkpoint['epoch'],
                        checkpoint['iteration']))
        except OSError:
            self.logger.info(
                "No checkpoint exists from '{}'. Skipping...".format(
                    self.config.checkpoint_dir))
            self.logger.info("**First time to train**")

    def set_parameter_requires_grad(self, model, feature_extract):
        """Freeze the parameters of the first child module of ``model``.

        :param model: module whose children are inspected.
        :param feature_extract: when falsy nothing is frozen.
        """
        print("############Setting grad")
        if not feature_extract:
            return
        # Children are counted from 1; only positions below this are frozen.
        first_trainable_position = 2
        for position, child in enumerate(model.children(), start=1):
            print("child", child)
            if position < first_trainable_position:
                for param in child.parameters():
                    param.requires_grad = False

    def load_source_model(self):
        """Load source-domain weights from ``model_repo`` and freeze layers.

        :return: ``True`` when the checkpoint was loaded, ``False`` when the
            file could not be opened.
        """
        # Resolved before the ``try`` so the error handler can always
        # reference both names.
        domain_name = self.config.source_domain
        filename = os.path.join("model_repo",
                                domain_name + "genfsl_checkpoint.pth.tar")
        try:
            self.logger.info(
                "******Loading source model for domain '{}'".format(
                    domain_name))
            checkpoint = torch.load(filename, map_location=self.device)
            self.logger.info("********Loaded checkpoint '{}'".format(filename))
            self.current_epoch = 0
            self.current_iteration = 0
            self.model.load_state_dict(checkpoint['state_dict'], strict=False)

            self.logger.info(
                "Checkpoint loaded successfully from '{}' at (epoch {}) at (iteration {})\n"
                .format(self.config.checkpoint_dir, checkpoint['epoch'],
                        checkpoint['iteration']))
            self.set_parameter_requires_grad(self.model, feature_extract=True)
            return True
        except OSError:
            self.logger.info(
                "No model checkpoint {} exists for source domain {}. Skipping..."
                .format(filename, domain_name))
            return False

    def load_model(self, domain_name):
        """Load the tuned model from ``tuned_model_repo`` for testing.

        :param domain_name: used only in log messages.
        :return: ``True`` when the checkpoint was loaded, ``False`` when the
            file could not be opened.
        """
        try:
            self.logger.info(
                "*Loading trained generative FSL model for domain '{}' for testing only"
                .format(domain_name))
            filename = os.path.join("tuned_model_repo",
                                    self.config.tuned_model_name)
            checkpoint = torch.load(filename, map_location=self.device)
            self.logger.info("********Loaded checkpoint '{}'".format(filename))
            self.current_epoch = 0
            self.current_iteration = 0
            self.model.load_state_dict(checkpoint['state_dict'], strict=False)

            self.logger.info(
                "Checkpoint loaded successfully from '{}' at (epoch {}) at (iteration {})\n"
                .format(self.config.checkpoint_dir, checkpoint['epoch'],
                        checkpoint['iteration']))
            return True
        except OSError:
            self.logger.info(
                "No model checkpoint exists for target domain {}. Skipping...".
                format(domain_name))
            return False

    def run(self):
        """Entry point: test when ``config.mode == 'test'``, else train."""
        try:
            if self.config.mode == 'test':
                self.validate_target_domain()
            else:
                self.train_target_domain()
        except KeyboardInterrupt:
            self.logger.info("You have entered CTRL+C.. Wait to finalize")

    def train_target_domain(self):
        """Fine-tune on ``config.target_domain``."""
        domain_name = self.config.target_domain
        self.train_model(domain_name)

    def train_model(self, domain_name):
        """Load the source model and, if that succeeds, train on the target.

        :param domain_name: target domain name.
        """
        self.logger.info("Fine-tuning.....Target {}, Source {} ".format(
            domain_name, self.config.source_domain))
        try:
            if self.load_source_model():
                self.data_loader = TargetDataLoader(config=self.config)
                self.train(domain_name)
        except KeyboardInterrupt:
            self.logger.info("You have entered CTRL+C.. Wait to finalize")

    def validate_target_domain(self):
        """Run the test procedure, passing ``config.source_domain`` along."""
        domain_name = self.config.source_domain
        self.test_model(domain_name)

    def test_model(self, domain_name):
        """Evaluate the tuned model and append the metrics to the results CSV.

        One row is written at threshold 0.5 when ``config.thresholding`` is
        falsy; otherwise one row per threshold in 0.1, 0.2, ..., 0.9. The
        file is opened in append mode and a header row is written on every
        call.

        :param domain_name: used only in the log message.
        """
        self.logger.info("Testing.....Source {}, Target {} ".format(
            domain_name, self.config.target_domain))
        try:
            if self.load_model(self.config.target_domain):
                self.data_loader = TargetDataLoader(config=self.config)
                # newline='' is what the csv module asks for on files it
                # writes; the ``with`` block closes the file.
                with open(self.config.results_file_name, mode='a+',
                          newline='') as csv_file:
                    fieldnames = [
                        'Threshold', 'Confusion_Matrix', 'Sensitivity',
                        'Specificity', 'F1', 'Accuracy'
                    ]
                    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
                    writer.writeheader()
                    if not self.config.thresholding:
                        row, _ = self.validate(0.5)
                        writer.writerow(row)
                    else:
                        for threshold in np.linspace(0.1, 0.9, 9):
                            row, _ = self.validate(threshold)
                            writer.writerow(row)
        except KeyboardInterrupt:
            self.logger.info("You have entered CTRL+C.. Wait to finalize")

    def train(self, domain_name):
        """Main training function, with per-epoch model saving.

        Resumes from the latest checkpoint for ``domain_name`` when one
        exists, then trains for ``config.max_epoch`` further epochs.

        :param domain_name: target domain name.
        """
        summary(self.model,
                input_size=(3, self.config.image_size,
                            self.config.image_size))
        self.criterion = nn.CrossEntropyLoss()
        self.logger.info(
            "Training linear layers for generative FSL in {} domain".format(
                domain_name))
        # Load the latest checkpoint; if none is found, start from scratch.
        # The name must match what save_checkpoint writes, otherwise
        # training can never resume.
        domain_checkpoint_file = self._checkpoint_name(domain_name)
        self.logger.info(
            "LOADING {}....................".format(domain_checkpoint_file))
        self.load_checkpoint(domain_checkpoint_file)

        for epoch in range(self.current_epoch,
                           self.current_epoch + self.config.max_epoch):
            self.current_epoch = epoch
            train_loss = self.train_one_epoch(domain_name)
            is_best = train_loss < self.best_train_loss
            if is_best:
                self.best_train_loss = train_loss
            self.save_checkpoint(is_best=is_best)

    def train_one_epoch(self, domain_name):
        """Run one epoch of training.

        :param domain_name: appended to the TensorBoard scalar tag.
        :return: ``epoch_lossD.val`` after the last batch.
            NOTE(review): presumably the most recent batch loss rather than
            the epoch average; confirm against ``AverageMeter``.
        """
        self.model.train()
        epoch_lossD = AverageMeter()
        for batch_idx, data in enumerate(self.data_loader.train_loader):
            self.optimizer.zero_grad()  # clear the gradients
            imgs, labels = data
            imgs = imgs.to(self.device)
            # Targets must be on the same device as the model output.
            labels = labels.to(self.device)
            predicted_labels = self.model(imgs)
            loss = self.criterion(predicted_labels, labels)
            loss.backward()
            # update model weights
            self.optimizer.step()
            epoch_lossD.update(loss.item())
            self.logger.info(batch_idx)
            if batch_idx % self.config.log_interval == 0:
                self.logger.info(
                    'Last Layers Training Epoch: {} [{}/{} ({:.0f}%)] Loss: {:6f}'
                    .format(
                        self.current_epoch,
                        batch_idx * self.config.batch_size,
                        len(self.data_loader.train_loader.dataset),
                        100. * (batch_idx * self.config.batch_size /
                                len(self.data_loader.train_loader.dataset)),
                        loss.item()))
            self.current_iteration += 1
            self.summary_writer.add_scalar(
                "epoch/Training_Loss_" + domain_name, epoch_lossD.val,
                self.current_iteration)

        self.logger.info("Training linear layers at epoch-" +
                         str(self.current_epoch) + " | " +
                         " - Training Loss-: " + str(epoch_lossD.val))
        return epoch_lossD.val

    def visualize_one_epoch(self):
        """Run the model over the test loader and print each output size."""
        self.model.eval()
        with torch.no_grad():
            for batch_idx, data in enumerate(self.data_loader.test_loader):
                testimgs, predicted_labels = data
                testimgs = testimgs.to(self.device)
                predicted_labels = self.model(testimgs)
                print(list(predicted_labels.size()))

    def add_pr_curve_tensorboard(self,
                                 class_index,
                                 test_probs,
                                 test_preds,
                                 global_step=0):
        """Log the precision-recall curve for one class.

        :param class_index: class whose curve is plotted.
        :param test_probs: per-class scores; column ``class_index`` is used.
        :param test_preds: class ids compared against ``class_index`` to
            build the binary labels.
        :param global_step: step recorded with the curve.
        """
        tensorboard_preds = test_preds == class_index
        tensorboard_probs = test_probs[:, class_index]

        self.summary_writer.add_pr_curve('PR for Covid prediction',
                                         tensorboard_preds,
                                         tensorboard_probs,
                                         global_step=global_step)

    def validate(self, threshold=0.5):
        """Run one cycle of model validation on the test loader.

        A sample is predicted positive (class 1) when its softmax
        probability for class 1 exceeds ``threshold``. This assumes the
        model output is ``[B, 2]`` with class 1 as the positive class.

        :param threshold: decision threshold on the class-1 probability.
        :return: ``(results, test_loss)``. ``results`` holds the keys
            ``Threshold``, ``Confusion_Matrix``, ``Sensitivity``,
            ``Specificity``, ``F1`` and ``Accuracy``. ``test_loss`` is
            always ``0`` because no loss is accumulated here.
        """
        self.criterion = nn.CrossEntropyLoss()
        self.model.eval()
        test_loss = 0
        y_true = []
        y_pred = []
        softmax = torch.nn.Softmax(1)  # constrained probabilities
        with torch.no_grad():
            for images, labels in self.data_loader.test_loader:
                y_true.extend(labels.flatten().tolist())
                # Inputs must live on the same device as the model.
                images = images.to(self.device)
                probabilities = softmax(self.model(images))  # [B,2]
                # Threshold the positive class directly. Taking the arg-max
                # of the boolean mask broke ties towards class 0, so any
                # threshold below 0.5 behaved like ``1 - threshold``.
                positive = probabilities[:, 1] > threshold
                y_pred.extend(positive.long().cpu().tolist())

        print("Threshold", threshold)
        cf = sklearn.metrics.confusion_matrix(y_true, y_pred)
        tn, fp, fn, tp = cf.ravel()
        print("CF", cf)
        print("confusion matrix ", tn, fp, fn, tp)
        sensitivity = tp / (tp + fn)
        specificity = tn / (tn + fp)
        p = sklearn.metrics.precision_score(y_true, y_pred)
        print("PRECISION", p)
        print("computed PRECISION", tp / (tp + fp))
        r = sklearn.metrics.recall_score(y_true, y_pred)
        print("recall", r)
        print("computed recall", tp / (tp + fn))
        f1 = sklearn.metrics.f1_score(y_true, y_pred, average="binary")
        print("F1", f1)
        acc = sklearn.metrics.accuracy_score(y_true, y_pred)
        print("acc", acc)
        print("sensitivity {},specificity {}".format(sensitivity,
                                                      specificity))
        results = {
            'Threshold': threshold,
            'Confusion_Matrix': cf,
            'Sensitivity': sensitivity,
            'Specificity': specificity,
            'F1': f1,
            'Accuracy': acc
        }
        return results, test_loss

    def finalize(self):
        """Export the logged scalars to JSON and close the summary writer.

        The writer is closed even when the export raises.
        """
        self.logger.info(
            "Please wait while finalizing the operation.. Thank you")
        try:
            self.summary_writer.export_scalars_to_json(
                "{}all_scalars.json".format(self.config.summary_dir))
        finally:
            self.summary_writer.close()
prec[:, j], recall[:, j] = (tp + 1e-10) / ( y_temp.sum(dim=-1).sum(dim=-1) + 1e-10), (tp + 1e-10) / ( mask.sum(dim=-1).sum(dim=-1) + 1e-10) # (batch, threshold) precs.append(prec) recalls.append(recall) prec = torch.cat(precs, dim=0).mean(dim=0) recall = torch.cat(recalls, dim=0).mean(dim=0) f_score = (1 + beta_square) * prec * recall / (beta_square * prec + recall) thlist = torch.linspace(0, 1 - 1e-10, 256) print("Max F_score :", torch.max(f_score)) print("Max_F_threshold :", thlist[torch.argmax(f_score)]) if args.logdir is not None: writer.add_scalar("Max F_score", torch.max(f_score), global_step=model_iter) writer.add_scalar("Max_F_threshold", thlist[torch.argmax(f_score)], global_step=model_iter) pred = torch.cat(preds, 0) mask = torch.cat(masks, 0).round().float() if args.logdir is not None: writer.add_pr_curve('PR_curve', mask, pred, global_step=model_iter) writer.add_scalar('MAE', torch.mean(torch.abs(pred - mask)), global_step=model_iter) print("MAE :", torch.mean(torch.abs(pred - mask))) # Measure method from https://github.com/AceCoooool/DSS-pytorch solver.py
class Summarizer(object):
    """Switchable facade over a ``SummaryWriter``.

    Every ``add_*`` method is a no-op while ``self.report`` is falsy. For
    methods that take a ``global_step``, an omitted step falls back to
    ``self.global_step``. The writer must be created with
    :meth:`initialize_writer` before reporting is enabled.
    """

    def __init__(self):
        self.report = False
        self.global_step = None
        self.writer = None

    def initialize_writer(self, log_dir):
        """Create the underlying ``SummaryWriter`` from ``log_dir``."""
        self.writer = SummaryWriter(log_dir)

    def _resolve_step(self, global_step):
        """Return ``global_step``, or ``self.global_step`` when it is None."""
        return self.global_step if global_step is None else global_step

    def add_scalar(self, tag, scalar_value, global_step=None, walltime=None):
        """Forward to ``writer.add_scalar`` when reporting is enabled."""
        if not self.report:
            return
        self.writer.add_scalar(tag,
                               scalar_value,
                               global_step=self._resolve_step(global_step),
                               walltime=walltime)

    def add_scalars(self,
                    main_tag,
                    tag_scalar_dict,
                    global_step=None,
                    walltime=None):
        """Forward to ``writer.add_scalars`` when reporting is enabled."""
        if not self.report:
            return
        # ``self.writer.add_scalars`` is already bound. The summarizer used
        # to be passed as an extra first argument, which shifted every
        # parameter by one.
        self.writer.add_scalars(main_tag,
                                tag_scalar_dict,
                                global_step=self._resolve_step(global_step),
                                walltime=walltime)

    def add_histogram(self,
                      tag,
                      values,
                      global_step=None,
                      bins='tensorflow',
                      walltime=None):
        """Forward to ``writer.add_histogram``; CuPy arrays go to CPU first."""
        if not self.report:
            return
        global_step = self._resolve_step(global_step)
        if isinstance(values, chainer.cuda.cupy.ndarray):
            values = chainer.cuda.to_cpu(values)
        self.writer.add_histogram(tag,
                                  values,
                                  global_step=global_step,
                                  bins=bins,
                                  walltime=walltime)

    def add_image(self, tag, img_tensor, global_step=None, walltime=None):
        """Forward to ``writer.add_image`` when reporting is enabled."""
        if not self.report:
            return
        self.writer.add_image(tag,
                              img_tensor,
                              global_step=self._resolve_step(global_step),
                              walltime=walltime)

    def add_image_with_boxes(self,
                             tag,
                             img_tensor,
                             box_tensor,
                             global_step=None,
                             walltime=None,
                             **kwargs):
        """Forward to ``writer.add_image_with_boxes`` when reporting."""
        if not self.report:
            return
        self.writer.add_image_with_boxes(
            tag,
            img_tensor,
            box_tensor,
            global_step=self._resolve_step(global_step),
            walltime=walltime,
            **kwargs)

    def add_figure(self,
                   tag,
                   figure,
                   global_step=None,
                   close=True,
                   walltime=None):
        """Forward to ``writer.add_figure`` when reporting is enabled."""
        if not self.report:
            return
        self.writer.add_figure(tag,
                               figure,
                               global_step=self._resolve_step(global_step),
                               close=close,
                               walltime=walltime)

    def add_video(self,
                  tag,
                  vid_tensor,
                  global_step=None,
                  fps=4,
                  walltime=None):
        """Forward to ``writer.add_video`` when reporting is enabled."""
        if not self.report:
            return
        self.writer.add_video(tag,
                              vid_tensor,
                              global_step=self._resolve_step(global_step),
                              fps=fps,
                              walltime=walltime)

    def add_audio(self,
                  tag,
                  snd_tensor,
                  global_step=None,
                  sample_rate=44100,
                  walltime=None):
        """Forward to ``writer.add_audio`` when reporting is enabled."""
        if not self.report:
            return
        self.writer.add_audio(tag,
                              snd_tensor,
                              global_step=self._resolve_step(global_step),
                              sample_rate=sample_rate,
                              walltime=walltime)

    def add_text(self, tag, text_string, global_step=None, walltime=None):
        """Forward to ``writer.add_text`` when reporting is enabled."""
        if not self.report:
            return
        self.writer.add_text(tag,
                             text_string,
                             global_step=self._resolve_step(global_step),
                             walltime=walltime)

    def add_graph_onnx(self, prototxt):
        """Forward to ``writer.add_graph_onnx`` when reporting is enabled."""
        if not self.report:
            return
        # Bound method: pass only ``prototxt``, not the summarizer itself.
        self.writer.add_graph_onnx(prototxt)

    def add_graph(self, model, input_to_model=None, verbose=False, **kwargs):
        """Forward to ``writer.add_graph`` when reporting is enabled."""
        if not self.report:
            return
        self.writer.add_graph(model,
                              input_to_model=input_to_model,
                              verbose=verbose,
                              **kwargs)

    def add_embedding(self,
                      mat,
                      metadata=None,
                      label_img=None,
                      global_step=None,
                      tag='default',
                      metadata_header=None):
        """Forward to ``writer.add_embedding`` when reporting is enabled."""
        if not self.report:
            return
        self.writer.add_embedding(mat,
                                  metadata=metadata,
                                  label_img=label_img,
                                  global_step=self._resolve_step(global_step),
                                  tag=tag,
                                  metadata_header=metadata_header)

    def add_pr_curve(self,
                     tag,
                     labels,
                     predictions,
                     global_step=None,
                     num_thresholds=127,
                     weights=None,
                     walltime=None):
        """Forward to ``writer.add_pr_curve`` when reporting is enabled."""
        if not self.report:
            return
        self.writer.add_pr_curve(tag,
                                 labels,
                                 predictions,
                                 global_step=self._resolve_step(global_step),
                                 num_thresholds=num_thresholds,
                                 weights=weights,
                                 walltime=walltime)

    def add_pr_curve_raw(self,
                         tag,
                         true_positive_counts,
                         false_positive_counts,
                         true_negative_counts,
                         false_negative_counts,
                         precision,
                         recall,
                         global_step=None,
                         num_thresholds=127,
                         weights=None,
                         walltime=None):
        """Forward to ``writer.add_pr_curve_raw`` when reporting is enabled."""
        if not self.report:
            return
        self.writer.add_pr_curve_raw(
            tag,
            true_positive_counts,
            false_positive_counts,
            true_negative_counts,
            false_negative_counts,
            precision,
            recall,
            global_step=self._resolve_step(global_step),
            num_thresholds=num_thresholds,
            weights=weights,
            walltime=walltime)

    def add_custom_scalars_multilinechart(self,
                                          tags,
                                          category='default',
                                          title='untitled'):
        """Forward to ``writer.add_custom_scalars_multilinechart``."""
        if not self.report:
            return
        self.writer.add_custom_scalars_multilinechart(tags,
                                                      category=category,
                                                      title=title)

    def add_custom_scalars_marginchart(self,
                                       tags,
                                       category='default',
                                       title='untitled'):
        """Forward to ``writer.add_custom_scalars_marginchart``."""
        if not self.report:
            return
        self.writer.add_custom_scalars_marginchart(tags,
                                                   category=category,
                                                   title=title)

    def add_custom_scalars(self, layout):
        """Forward to ``writer.add_custom_scalars`` when reporting."""
        if not self.report:
            return
        self.writer.add_custom_scalars(layout)
dummy_img = torch.rand(32, 3, 64, 64) # output from network if n_iter % 10 == 0: x = vutils.make_grid(dummy_img, normalize=True, scale_each=True) writer.add_image('Image', x, n_iter) dummy_audio = torch.zeros(sample_rate * 2) for i in range(x.size(0)): # amplitude of sound should in [-1, 1] dummy_audio[i] = np.cos(freqs[n_iter // 10] * np.pi * float(i) / float(sample_rate)) writer.add_audio('myAudio', dummy_audio, n_iter, sample_rate=sample_rate) writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter) for name, param in resnet18.named_parameters(): writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter) # needs tensorboard 0.4RC or later writer.add_pr_curve('xoxo', np.random.randint(2, size=100), np.random.rand(100), n_iter) dataset = datasets.MNIST('mnist', train=False, download=True) images = dataset.test_data[:100].float() label = dataset.test_labels[:100] features = images.view(100, 784) writer.add_embedding(features, metadata=label, label_img=images.unsqueeze(1)) # export scalar data to JSON for external processing writer.export_scalars_to_json("./all_scalars.json") writer.close()
def launch_train(config, model, path_model, path_data_train, path_data_dev,
                 nb_epoch=5, device='cpu', type_sentence_embedding='lstm',
                 restart_at_epoch=0):
    """Train ``model`` on pre-computed episode files and evaluate it on dev.

    Adapted from
    https://gist.github.com/Tushar-N/dfca335e370a2bc3bc79876e6270099e

    For every epoch the function walks through the files
    ``inputs_embeddings_<i>.pickle``, ``X_lengths_<i>.pickle`` and
    ``outputs_refs_<i>.pickle`` of ``path_data_train`` (loaded with
    ``torch.load``), performs one Adam step per non-empty episode, saves the
    weights to ``path_model + 'models/model_<epoch>.pth.tar'`` and then
    evaluates on the files of ``path_data_dev``. Losses, equal-error-rate
    figures, parameter histograms and PR curves are sent to a tensorboardX
    ``SummaryWriter``; a ROC plot is written to
    ``config['path_work'] + 'roc_curve_<epoch>.pdf'``.

    The weights of the epoch with the lowest mean dev loss so far are saved
    to ``path_model + 'model_best_<epoch>.pth.tar'``.

    Args:
        config: Mapping; only ``config['path_work']`` is read.
        model: Network exposing ``init_hidden``, ``forward_sentence``,
            ``taille_context`` and being callable on the windowed input.
        path_model: Path prefix under which checkpoints are written.
        path_data_train: Path prefix of the training files (concatenated
            as-is, so it must end with a path separator).
        path_data_dev: Path prefix of the dev files (same convention).
        nb_epoch: Exclusive upper bound of the epoch range.
        device: Device the tensors are moved to before the forward pass.
        type_sentence_embedding: When ``'lstm'``, the sentence-level hidden
            state is re-initialised before every episode.
        restart_at_epoch: First epoch index to run.

    Returns:
        list: Mean training loss of every epoch that was run.
    """
    check_dev_epoch = 1  # evaluate on dev every `check_dev_epoch` epochs
    writer = SummaryWriter(comment='1 couche')

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    model = model.train()

    print('début train')
    losses = []
    # Tracked across epochs so the final report names the real best epoch
    # and is defined even when the epoch range is empty.
    best_loss_dev = None
    id_best_loss_dev = 0
    nb_files_train = len(
        glob.glob(path_data_train + 'inputs_embeddings_*.pickle'))
    nb_files_dev = len(
        glob.glob(path_data_dev + 'inputs_embeddings_*.pickle'))

    try:
        for epoch in range(restart_at_epoch, nb_epoch):
            print('epoch', epoch + 1, 'on', nb_epoch)
            model.train()  # dev evaluation below leaves the model in eval mode
            losses_ = []
            Y_pred = []
            Y_ref = []

            for it_ in tqdm(range(nb_files_train)):
                # Each file holds all window-size tensors of one episode set.
                episodes = _load_episode_file(path_data_train, it_)
                for sentences_emb, X_length, ref in zip(*episodes):
                    # TODO: some tensors are empty (e.g. the 142nd, counting
                    # from 0); they are skipped.
                    if sentences_emb.shape[0] == 0:
                        continue

                    # PyTorch accumulates gradients: clear them per episode.
                    optimizer.zero_grad()
                    prediction, ref = _windowed_forward(
                        model, sentences_emb.to(device), X_length.to(device),
                        ref.to(device), type_sentence_embedding)

                    loss = criterion(prediction, ref)
                    losses_.append(loss.item())
                    loss.backward()
                    optimizer.step()

                    # Kept to compute the EER and the PR curve of the epoch.
                    Y_pred.append(prediction.detach().cpu().numpy())
                    Y_ref.append(ref.detach().cpu().numpy())

            Y_ref = np.concatenate(Y_ref, axis=None)
            Y_pred = np.concatenate(Y_pred, axis=None)
            # add_pr_curve needs the per-sample labels and scores, not means.
            writer.add_pr_curve('score_train', Y_ref, Y_pred, epoch)

            mean_loss_train = np.mean(np.asarray(losses_))
            print('Mean loss per epoch train', mean_loss_train)
            losses.append(mean_loss_train)

            torch.save(model.state_dict(),
                       path_model + 'models/model_' + str(epoch) + '.pth.tar')

            if epoch % check_dev_epoch != 0:
                continue

            # ---- Evaluation on the dev set ----
            for name, param in model.named_parameters():
                writer.add_histogram(name,
                                     param.clone().cpu().data.numpy(), epoch)

            eer_threshold_train, eer_train, _, _ = _equal_error_rate(
                Y_ref, Y_pred)

            losses_dev_ = []
            Y_pred = []
            Y_ref = []
            first_batch = True
            model.eval()
            with torch.no_grad():  # no optimisation happens on dev
                for it_ in range(nb_files_dev):
                    episodes = _load_episode_file(path_data_dev, it_)
                    for sentences_emb, X_length, ref in zip(*episodes):
                        if sentences_emb.shape[0] == 0:
                            print('tensor empty wtf')
                            continue

                        prediction, ref = _windowed_forward(
                            model, sentences_emb.to(device),
                            X_length.to(device), ref.to(device),
                            type_sentence_embedding)

                        if first_batch:
                            # Debug aid: show the first dev prediction.
                            print(prediction, ref)
                            first_batch = False

                        losses_dev_.append(criterion(prediction, ref).item())
                        Y_pred.append(prediction.cpu().numpy())
                        Y_ref.append(ref.cpu().numpy())

            Y_ref = np.concatenate(Y_ref, axis=None)
            Y_pred = np.concatenate(Y_pred, axis=None)
            eer_threshold_dev, eer_dev, fpr, tpr = _equal_error_rate(
                Y_ref, Y_pred)

            plt.plot(fpr, tpr, lw=1, alpha=0.3,
                     label='ROC fold (AUC = %0.2f)' % (auc(fpr, tpr)))
            plt.xlim([-0.05, 1.05])
            plt.ylim([-0.05, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver operating characteristic curve')
            plt.legend(loc="lower right")
            plt.savefig(config['path_work'] + 'roc_curve_' + str(epoch) +
                        '.pdf')
            plt.close()

            mean_loss_dev = np.mean(np.asarray(losses_dev_))
            # The weights do not change during evaluation, so the best model
            # is selected once per epoch on the mean dev loss.
            if best_loss_dev is None or mean_loss_dev < best_loss_dev:
                torch.save(
                    model.state_dict(),
                    path_model + 'model_best_' + str(epoch) + '.pth.tar')
                best_loss_dev = mean_loss_dev
                id_best_loss_dev = epoch

            print('Mean loss on dev', mean_loss_dev)
            print('EER threshold, fpr (train/dev)', eer_threshold_train,
                  eer_train, eer_threshold_dev, eer_dev)
            print('id_best_loss_dev', id_best_loss_dev)

            writer.add_scalars(
                'data/scalar_group', {
                    'loss_train': mean_loss_train,
                    'loss_dev': mean_loss_dev,
                    'score_train': eer_train,
                    'score_dev': eer_dev
                }, epoch)
            writer.add_pr_curve('score_dev', Y_ref, Y_pred, epoch)
    finally:
        # Flush the event file even when training is interrupted.
        writer.close()

    print('Best model on epoch', id_best_loss_dev)
    print('fin train')
    return losses


def _load_episode_file(path_data, file_id):
    """Load the inputs, lengths and references stored for file ``file_id``."""
    suffix = str(file_id) + '.pickle'
    sentences_embs = torch.load(path_data + 'inputs_embeddings_' + suffix)
    X_lengths = torch.load(path_data + 'X_lengths_' + suffix)
    refs = torch.load(path_data + 'outputs_refs_' + suffix)
    return sentences_embs, X_lengths, refs


def _windowed_forward(model, sentences_emb, X_length, ref,
                      type_sentence_embedding):
    """Run one episode through ``model`` with a sliding context window.

    The sentence encoder output is cut into every window of
    ``model.taille_context`` consecutive rows; window ``i`` is paired with
    the label at index ``i + taille_context // 2 - 1`` of ``ref``.

    Returns:
        tuple: ``(prediction, target)`` with dimension 1 of the model output
        squeezed and one label per window.
    """
    if type_sentence_embedding == 'lstm':
        # Detach the sentence-level LSTM state from the previous episode.
        model.hidden_sentences = model.init_hidden(
            batch_size=sentences_emb.shape[1])
    sentence_vectors = model.forward_sentence(sentences_emb, X_length)

    context = model.taille_context
    nb_windows = sentence_vectors.shape[0] - context + 1
    # (n, context, dim) -> (context, n, dim); torch.stack raises on an
    # episode shorter than the context window.
    windows = torch.stack([
        sentence_vectors[start:start + context]
        for start in range(nb_windows)
    ]).transpose(0, 1)

    first_label = context // 2 - 1
    label_ids = torch.arange(first_label, first_label + nb_windows,
                             device=ref.device)
    target = ref.squeeze(0).index_select(0, label_ids)

    model.hidden = model.init_hidden(batch_size=windows.shape[1])
    prediction = torch.squeeze(model(windows), 1)
    return prediction, target


def _equal_error_rate(y_ref, y_pred):
    """Return ``(threshold, fpr_at_eer, fpr, tpr)`` from the ROC curve."""
    fpr, tpr, threshold = roc_curve(y_ref, y_pred, pos_label=1)
    fnr = 1 - tpr
    # Operating point where false-negative and false-positive rates meet.
    eer_index = np.nanargmin(np.absolute(fnr - fpr))
    return threshold[eer_index], fpr[eer_index], fpr, tpr
class TensorboardXLogger(NumpySeabornPlotLogger):
    """Logger that uses tensorboardX to log to Tensorboard."""

    def __init__(self, target_dir, *args, **kwargs):
        """
        Creates the log directory and the tensorboardX writer.

        Args:
            target_dir (str): Directory the event files are written to
                (created if missing)
        """
        super().__init__(*args, **kwargs)

        os.makedirs(target_dir, exist_ok=True)

        self.writer = SummaryWriter(target_dir)
        # Last global step used for every "<name>-<kind>" key.
        self.val_dict = defaultdict(int)

        # Make sure pending events are flushed when the interpreter exits.
        atexit.register(self.writer.close)

    def _next_step(self, key, counter=None):
        """
        Updates and returns the global step stored under ``key``.

        Args:
            key (str): Counter identifier in ``self.val_dict``
            counter (int): If given, the step is set to this value,
                otherwise the stored step is incremented by one

        Returns:
            int: The global step to log with
        """
        if counter is not None:
            self.val_dict[key] = counter
        else:
            self.val_dict[key] += 1
        return self.val_dict[key]

    def show_image(self, image, name="Image", counter=None, **kwargs):
        """
        Sends an image to tensorboard.

        Args:
            image (np.ndarray/torch.tensor): Image array/tensor which will be sent
            name (str): Identifier for the image
            counter (int): Global step value
        """
        step = self._next_step("{}-image".format(name), counter)
        self.writer.add_image(name, image, global_step=step)

    def show_images(self, images, name="Images", counter=None, **kwargs):
        """
        Sends multiple images to tensorboard.

        Args:
            images (np.ndarray/torch.tensor): Image array/tensor which will be sent (NxCxHxW)
            name (str): Identifier for the images
            counter (int): Global step value
        """
        step = self._next_step("{}-image".format(name), counter)
        self.writer.add_images(name, images, global_step=step)

    @convert_params
    def show_value(self, value, name="Value", counter=None, tag=None, **kwargs):
        """
        Sends a scalar value to tensorboard.

        Args:
            value (numeric): Value to be sent
            name (str): Identifier for the value
            counter (int): Global step value
            tag (str): Identifier for the frame (values with the same tag will be shown in the same graph)
        """
        key = (name if tag is None else tag) + "-" + name
        # The "-image" suffix is historical; it is kept so existing counters
        # continue from the same keys.
        step = self._next_step("{}-image".format(key), counter)

        if tag is not None:
            self.writer.add_scalars(tag, {name: value}, global_step=step)
            # NOTE(review): presumably resets the writer's scalar cache so it
            # does not grow with every call - confirm against tensorboardX.
            self.writer.scalar_dict = {}
        else:
            self.writer.add_scalar(name, value, global_step=step)

    def show_text(self, text, name="Text", counter=None, **kwargs):
        """
        Sends text to tensorboard.

        Args:
            text (str): Text to be sent
            name (str): Identifier for the text
            counter (int): Global step value
        """
        step = self._next_step("{}-text".format(name), counter)
        self.writer.add_text(name, text, global_step=step)

    @convert_params
    def show_image_grid(self, image_array, name="Image-Grid", counter=None, nrow=8, padding=2,
                        normalize=False, range=None, scale_each=False, pad_value=0, *args, **kwargs):
        """
        Sends an array of images to tensorboard as a grid. Like :meth:`.show_image`, but generates
        image grid before.

        Args:
            image_array (np.ndarray/torch.tensor): Image array/tensor which will be sent as an image grid
            name (str): Identifier for the image grid
            counter (int): Global step value
            nrow (int): Items per row in grid
            padding (int): Padding between images in grid
            normalize (bool): Normalize images in grid
            range (tuple): Tuple (min, max), so images will be normalized to this range
            scale_each (bool): If True, each image will be normalized separately instead of using
                min and max of whole tensor
            pad_value (float): Fill padding with this value
        """
        image_args = dict(nrow=nrow,
                          padding=padding,
                          normalize=normalize,
                          range=range,
                          scale_each=scale_each,
                          pad_value=pad_value)

        step = self._next_step("{}-image".format(name), counter)

        grid = np_make_grid(image_array, **image_args)
        self.writer.add_image(tag=name, img_tensor=grid, global_step=step)
        # NOTE(review): this un-suffixed counter is not read anywhere in this
        # class; kept in case external code inspects val_dict.
        self.val_dict[name] += 1

    @convert_params
    def show_barplot(self, array, name="barplot", counter=None, *args, **kwargs):
        """
        Sends a barplot to tensorboard.

        Args:
            array (np.array/torch.tensor): array of shape NxM where N is the number of rows and M is the number of
                elements in the row.
            name (str): The name of the figure
            counter (int): Global step value to record
        """
        step = self._next_step("{}-figure".format(name), counter)
        figure = super().show_barplot(array, name, *args, **kwargs)
        self.writer.add_figure(tag=name, figure=figure, global_step=step)

    @convert_params
    def show_lineplot(self, y_vals, x_vals=None, name="lineplot", counter=None, *args, **kwargs):
        """
        Sends a lineplot to tensorboard.

        Args:
            y_vals (np.array/torch.tensor): Array of shape MxN , where M is the number of points and N is the number
                of different line
            x_vals (np.array/torch.tensor): Has to have the same shape as Y: MxN. For each point in Y it gives the
                corresponding X value (if not set the points are assumed to be equally distributed in the
                interval [0, 1])
            name (str): The name of the figure
            counter (int): Global step value to record
        """
        step = self._next_step("{}-figure".format(name), counter)
        figure = super().show_lineplot(y_vals, x_vals, name, *args, **kwargs)
        self.writer.add_figure(tag=name, figure=figure, global_step=step)

    @convert_params
    def show_scatterplot(self, array, name="scatterplot", counter=None, *args, **kwargs):
        """
        Sends a scatterplot to tensorboard.

        Args:
            array (np.array/torch.tensor): An array with size N x dim, where each row i of the N rows
                results in a 2D (if dim = 2) or 3D (if dim = 3) point.
            name (str): The name of the figure
            counter (int): Global step value to record
        """
        step = self._next_step("{}-figure".format(name), counter)
        figure = super().show_scatterplot(array, name, *args, **kwargs)
        self.writer.add_figure(tag=name, figure=figure, global_step=step)

    @convert_params
    def show_piechart(self, array, name="piechart", counter=None, *args, **kwargs):
        """
        Sends a piechart to tensorboard.

        Args:
            array (np.array/torch.tensor): Array of positive integers. Each integer will be presented as a part
                of the pie (with the total as the sum of all integers)
            name (str): The name of the figure
            counter (int): Global step value to record
        """
        step = self._next_step("{}-figure".format(name), counter)
        figure = super().show_piechart(array, name, *args, **kwargs)
        self.writer.add_figure(tag=name, figure=figure, global_step=step)

    def show_embedding(self, tensor, labels=None, name='default', label_img=None, counter=None,
                       *args, **kwargs):
        """
        Displays an embedding of a tensor (for more details see tensorboardX)

        Args:
            tensor (torch.tensor/np.array): Tensor to be embedded and then displayed
            labels (list): List of labels, each element will be converted to string
            name (str): The name for the embedding
            label_img (torch.tensor): Images to be displayed at the embedding points
            counter (int): Global step value to record
        """
        step = self._next_step("{}-embedding".format(name), counter)
        self.writer.add_embedding(mat=tensor, metadata=labels, label_img=label_img, tag=name,
                                  global_step=step)

    def show_histogram(self, array, name="Histogram", counter=None, *args, **kwargs):
        """
        Plots a histogram in the tensorboard histogram plugin

        Args:
            array (torch.tensor/np.array): Values to build histogram
            name (str): Data identifier
            counter (int): Global step value to record
        """
        step = self._next_step("{}-histogram".format(name), counter)
        self.writer.add_histogram(tag=name, values=array, global_step=step)

    def show_pr_curve(self, tensor, labels, name="pr-curve", counter=None, *args, **kwargs):
        """
        Displays a precision recall curve given a tensor with scores and the corresponding labels

        Args:
            tensor (torch.tensor/np.array): Tensor with scores (e.g class probabilities)
            labels (list): Labels of the samples to which the scores match
            name (str): The name of the plot
            counter (int): Global step value
        """
        step = self._next_step("{}-pr-curve".format(name), counter)
        self.writer.add_pr_curve(tag=name, labels=labels, predictions=tensor, global_step=step)

    def close(self):
        """Closes the underlying tensorboardX writer."""
        self.writer.close()
# region Final report
# Compact pandas console output: two decimals, all columns, no line wrapping.
for _option, _setting in (('display.precision', 2),
                          ('display.max_columns', 999),
                          ('display.expand_frame_repr', False)):
    pd.set_option(_option, _setting)

# Merge the per-key lists of arrays collected so far into one flat frame.
nodes_df = pd.DataFrame(
    {column: np.concatenate(chunks) for column, chunks in nodes_df.items()})

experiment.average_precision = sklearn.metrics.average_precision_score(
    y_true=nodes_df.Targets, y_score=nodes_df.Results)
print('Average precision:', experiment.average_precision)

if logger is not None:
    # Same global step for the scalar and the PR curve.
    report_step = experiment.samples
    logger.add_scalar('metrics/val/avg_precision',
                      experiment.average_precision,
                      global_step=report_step)
    logger.add_pr_curve('infection',
                        labels=nodes_df.Targets.values,
                        predictions=nodes_df.Results.values,
                        global_step=report_step)

# Disabled matplotlib rendering of the same precision-recall curve.
# noinspection PyUnreachableCode
if False:
    import matplotlib.pyplot as plt

    precision, recall, _ = sklearn.metrics.precision_recall_curve(
        y_true=nodes_df.Targets, probas_pred=nodes_df.Results)
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, alpha=0.2, color='b', step='post')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(f'Precision-Recall curve: AP={experiment.average_precision:.2f}')
img = batch['image'].to(device) mask = batch['mask'].to(device) with torch.no_grad(): pred, loss = model(img, mask) pred = pred[5].data mse += F.mse_loss(pred, mask) pred = pred.requires_grad_(False) preds.append(pred) masks.append(mask) if not i < 100: break pred = torch.stack(preds, 0) mask = torch.stack(masks, 0) writer.add_pr_curve('PR_curve', mask, pred, global_step=int( model_name.split('epo_')[1].split('step')[0])) writer.add_scalar('MAE', F.mse_loss(pred, mask), global_step=int( model_name.split('epo_')[1].split('step')[0])) prediction = pred.data.cpu().numpy().flatten() target = mask.data.round().cpu().numpy().flatten() # print(type(prediction)) precision, recall, threshold = precision_recall_curve( target, prediction) f_score = (1 + beta_square) * precision * recall / ( beta_square * precision + recall) writer.add_scalar("Max F_score", np.max(f_score),
def main():
    """Train a point-cloud classifier on nuScenes radar data.

    Reads its whole configuration from the module-level ``args``. The
    function builds the train/test datasets, instantiates the network
    selected by ``args.model_name``, resumes from
    ``<projdir>/experiment/checkpoints/<exp_name>.pth`` when that file
    exists, and then trains for ``args.epoch`` epochs. Loss, learning rate,
    accuracies, confusion matrices and per-class PR curves are logged to
    tensorboardX under ``<projdir>/experiment/tensorboardX/<exp_name>``.

    Raises:
        Exception: If ``args.model_name`` is not a known network.
        ValueError: If ``args.optimizer`` is neither ``'SGD'`` nor ``'ADAM'``.
    """
    # --- SELECT DEVICES ---
    device = torch.device("cuda" if args.cuda else "cpu")
    # Select among available GPUs
    if args.cuda:
        os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
            str(x) for x in args.gpudevice)

    # --- CREATE EXPERIMENTS DIRECTORY AND LOGGERS IN TENSORBOARD ---
    projdir = sys.path[0]
    # Path for saving and loading the network. Components are joined one by
    # one so the path is valid on every OS (a literal '\\' only works on
    # Windows).
    saveloadpath = os.path.join(
        projdir, 'experiment', 'checkpoints', args.exp_name + '.pth')
    Path(os.path.dirname(saveloadpath)).mkdir(exist_ok=True, parents=True)
    tblogdir = os.path.join(
        projdir, 'experiment', 'tensorboardX', args.exp_name)
    Path(tblogdir).mkdir(exist_ok=True, parents=True)
    # flush_secs is the delay (seconds) between two flushes to disk.
    tb_writer = SummaryWriter(
        logdir=tblogdir, flush_secs=3, write_to_disk=True)

    try:
        # --- INIT DATASETS AND DATALOADER (FOR SINGLE EPOCH) ---
        # Dataloader returns (B:batch, S:seq, C:features, N:points)
        dataTransformations = transforms.Compose([
            ToSeries(),
            DataAugmentation(),
            Resampling(maxPointsPerFrame=10),
            ToTensor()
        ])
        nusc_train = NuScenes(version=args.nuscenes_train_dir,
                              dataroot=args.nuscenes_dir, verbose=True)
        train_dataset = RadarClassDataset(nusc_train,
                                          categories=args.categories,
                                          sensors=args.sensors,
                                          transforms=dataTransformations,
                                          sequence_length=1)
        nusc_test = NuScenes(version=args.nuscenes_test_dir,
                             dataroot=args.nuscenes_dir, verbose=True)
        test_dataset = RadarClassDataset(nusc_test,
                                         categories=args.categories,
                                         sensors=args.sensors,
                                         transforms=dataTransformations,
                                         sequence_length=1)
        trainDataLoader = DataLoader(train_dataset,
                                     batch_size=args.batchsize,
                                     shuffle=True,
                                     num_workers=args.num_workers)

        # --- INIT NETWORK MODEL ---
        if args.model_name == 'pointnet':
            classifier = PointNetCls(dim=args.pointCoordDim,
                                     num_class=len(args.categories),
                                     feature_transform=args.feature_transform)
        elif args.model_name == 'pointnet2':
            classifier = PointNet2ClsMsg(dim=args.pointCoordDim,
                                         num_class=len(args.categories))
        else:
            raise Exception(
                'Argument "model_name" does not match existent networks')
        classifier = classifier.to(device)

        # --- INIT LOSS FUNCTION ---
        loss_fun = FocalLoss(gamma=args.focalLoss_gamma,
                             num_classes=len(args.categories),
                             alpha=args.weight_cat).to(device)

        # --- LOAD NETWORK IF EXISTS ---
        if os.path.exists(saveloadpath):
            print('Using pretrained model found...')
            # map_location lets a checkpoint saved on GPU resume on CPU.
            checkpoint = torch.load(saveloadpath, map_location=device)
            # Resume with the epoch following the saved one.
            start_epoch = checkpoint['epoch'] + 1
            iteration = checkpoint['iteration']
            best_test_acc = checkpoint['test_accuracy']
            classifier.load_state_dict(checkpoint['model_state_dict'])
            # NOTE(review): 'optimizer_state_dict' is saved below but not
            # restored here, and the LR scheduler restarts from scratch.
        else:
            print('No existing model, starting training from scratch...')
            # Counters start at 1 so printed values read 1, 2, ...
            start_epoch = 1
            iteration = 1
            best_test_acc = 0

        # --- CREATE OPTIMIZER ---
        if args.optimizer == 'SGD':
            optimizer = torch.optim.SGD(
                classifier.parameters(), lr=args.lr, momentum=0.9)
        elif args.optimizer == 'ADAM':
            optimizer = torch.optim.Adam(
                classifier.parameters(), lr=args.lr, betas=(0.9, 0.999),
                eps=1e-08, weight_decay=args.decay_rate)
        else:
            # Fail here with a clear message instead of a NameError below.
            raise ValueError(
                'Argument "optimizer" must be "SGD" or "ADAM", got %r'
                % (args.optimizer,))
        # Halve the learning rate every `step_size` epochs.
        scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=args.lr_epoch_half, gamma=0.5)

        # Log info
        printparams = 'Model parameters:' + json.dumps(
            vars(args), indent=4, sort_keys=True)
        print(printparams)
        tb_writer.add_text('hyper-parameters', printparams, iteration)
        tb_writer.add_text(
            'dataset',
            'dataset sample size: training: {}, test: {}'.format(
                len(train_dataset), len(test_dataset)),
            iteration)

        # --- START TRAINING ---
        for epoch in range(start_epoch, args.epoch + 1):
            print('Epoch %d/%s:' % (epoch, args.epoch))
            tb_writer.add_scalar('learning_rate',
                                 optimizer.param_groups[0]['lr'], iteration)

            for batch_id, data in tqdm(enumerate(trainDataLoader, 0),
                                       total=len(trainDataLoader),
                                       smoothing=0.9):
                # (B x S x C x N), (B x S)
                points, target = data
                # Drop the sequence dimension (always 1), cast to float and
                # move to the device -> (B x C x N), (B)
                points = points.squeeze(dim=1).float().to(device)
                target = target.float().to(device)

                optimizer.zero_grad()
                # test() may have left the module in another mode.
                classifier = classifier.train()
                pred = classifier(points)
                loss = loss_fun(pred, target.long())
                if args.model_name == 'pointnet':
                    loss += feature_transform_regularizer(
                        classifier.trans) * 0.001
                    if args.feature_transform:
                        loss += feature_transform_regularizer(
                            classifier.trans_feat) * 0.001
                loss.backward()
                optimizer.step()

                # Log the training loss once every 5 batches.
                if not batch_id % 5:
                    tb_writer.add_scalar('train_loss/cross_entropy',
                                         loss.item(), iteration)
                iteration += 1

                # Print the train confusion matrix every 20 iterations.
                if not iteration % 20:
                    confmatrix_train = metrics_confusion_matrix(target, pred)
                    print('\nTrain confusion matrix: \n', confmatrix_train)

            # --- TEST NETWORK ---
            if not epoch % int(args.test_every_X_epochs):
                train_targ, train_pred = test(classifier, train_dataset,
                                              device, num_workers=0,
                                              batch_size=512)
                test_targ, test_pred = test(classifier, test_dataset,
                                            device, num_workers=0,
                                            batch_size=512)
                train_acc = metrics_accuracy(train_targ, train_pred)
                test_acc = metrics_accuracy(test_targ, test_pred)
                print('\r Training loss: {}'.format(loss.item()))
                print('Train Accuracy: {}\nTest Accuracy: {}'.format(
                    train_acc, test_acc))
                tb_writer.add_scalars('metrics/accuracy',
                                      {'train': train_acc, 'test': test_acc},
                                      iteration)

                confmatrix_test = metrics_confusion_matrix(test_targ,
                                                           test_pred)
                print('Test confusion matrix: \n', confmatrix_test)
                fig, ax = plot_confusion_matrix(
                    confmatrix_test, args.categories, normalize=False,
                    title='Test Confusion Matrix')
                fig_n, ax_n = plot_confusion_matrix(
                    confmatrix_test, args.categories, normalize=True,
                    title='Test Confusion Matrix - Normalized')
                tb_writer.add_figure('test_confusion_matrix/abs', fig,
                                     global_step=iteration, close=True)
                tb_writer.add_figure('test_confusion_matrix/norm', fig_n,
                                     global_step=iteration, close=True)

                # One precision-recall curve per class (one-vs-rest).
                for idx, clsname in enumerate(args.categories):
                    # Assumes test_pred holds log-probabilities, so exp()
                    # yields class probabilities - TODO confirm.
                    test_pred_binary = torch.exp(test_pred[:, idx])
                    test_targ_binary = test_targ.eq(idx)
                    tb_writer.add_pr_curve(tag='pr_curves/' + clsname,
                                           labels=test_targ_binary,
                                           predictions=test_pred_binary,
                                           global_step=iteration)

                best_test_acc = max(best_test_acc, test_acc)

            # --- SAVE NETWORK ---
            # NOTE: we possibly want to save only when the best test
            # accuracy is surpassed. For now save every X epochs.
            if not epoch % int(args.save_every_X_epochs):
                print('Best Accuracy: %f' % best_test_acc)
                state = {
                    'epoch': epoch,
                    'iteration': iteration,
                    'test_accuracy': best_test_acc,
                    'model_state_dict': classifier.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                }
                torch.save(state, saveloadpath)
                print('Model saved!!!')

            scheduler.step()
    finally:
        # Flush and release the event file even if training fails.
        tb_writer.close()
loss = F.binary_cross_entropy(y_pred, labels) if i%50 == 0: print ("epoch ", epoch_id, " loss", i, loss.item()) optimizer.zero_grad() loss.backward() optimizer.step() for name, param in model.named_parameters(): if debug_gradient: if param.requires_grad: print (name, "\n", param.data, "\n", "grad", param.grad) writer.add_pr_curve("pr_curve, epoch_id:" + str(epoch_id), valid_y, model(valid_X)) writer.add_scalars('loss', {'training': F.binary_cross_entropy(model(train_X), train_y), 'validation': F.binary_cross_entropy(model(valid_X), valid_y)}, epoch_id) print ("Evaluating after training") y_pred = model(test_X) loss = F.binary_cross_entropy(y_pred, test_y) y_pred_numpy = y_pred.detach().numpy() test_y_numpy = test_y.detach().numpy() writer.add_scalars('precision/recall/f1', { 'precision': precision_score(test_y_numpy, y_pred_numpy > 0.5, average='samples'), 'recall': recall_score(test_y_numpy, y_pred_numpy > 0.5, average='samples'), 'f1_score': f1_score(test_y_numpy, y_pred_numpy > 0.5, average='samples')}, 1)
class Unet(nn.Module):
    """Encoder/decoder convolutional network with skip connections.

    The network returns raw logits; ``self.activation`` (a sigmoid) is
    applied separately in :meth:`train_epoch` and :meth:`predict` to obtain
    scores that are then thresholded into a binary mask.

    Args:
        nb_classes: Number of output channels of the final convolution.
        experiment: Root directory under which logs, checkpoints and
            evaluation artefacts are written.
        device: Device the input batches are moved to.
        c_in: Number of channels of the input image.
        nb_blocks: Number of downsampling (and upsampling) blocks.
        nb_layers: Number of layers per block and of bottom convolutions.
        nb_channels: Number of kernels of the first block; doubled after
            every downsampling block.

    Note:
        ``self.c_in`` and ``self.c_ker`` are reused as running counters
        while the layers are built, so after construction they no longer
        hold the constructor arguments.
    """

    def __init__(self,
                 nb_classes,
                 experiment,
                 device,
                 c_in=1,
                 nb_blocks=4,
                 nb_layers=2,
                 nb_channels=8):
        super(Unet, self).__init__()

        self.nb_classes = nb_classes
        self.nb_blocks = nb_blocks
        self.nb_layers = nb_layers
        self.c_in = c_in
        self.c_ker = nb_channels
        self.experiment = experiment
        self.device = device

        # Downsampling path: channel count doubles after each block.
        block = []
        for _ in range(self.nb_blocks):
            block += block_downsampling(self.nb_layers, self.c_in, self.c_ker)
            self.c_in = self.c_ker
            self.c_ker *= 2
        self.down = nn.Sequential(*block)

        # In-between downsampling and upsampling.
        bottom = []
        for _ in range(self.nb_layers):
            bottom.append(
                nn.Conv2d(self.c_in, self.c_ker, (3, 3), stride=1, padding=1))
            bottom.append(nn.ReLU())
            self.c_in = self.c_ker
        self.bottom = nn.Sequential(*bottom)

        # Upsampling path: channel count halves after each block.
        block = []
        for _ in range(self.nb_blocks):
            block += block_upsampling(self.nb_layers, self.c_in, self.c_ker)
            self.c_ker //= 2
            self.c_in = self.c_ker
        self.up = nn.Sequential(*block)

        # Last step: project to the requested number of classes.
        self.lastConv = nn.Conv2d(self.c_in,
                                  self.nb_classes, (3, 3),
                                  stride=1,
                                  padding=1)

        # Resizing for targets.
        self.avgPool = nn.AvgPool2d((2, 2))
        self.activation = nn.Sigmoid()

        # Weight initialization.
        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
                nn.init.kaiming_normal_(m.weight)

    @staticmethod
    def _every_fifth(total):
        """Return the period that splits ``total`` items in five, at least 1.

        Used as a modulo divisor, so it must never be zero (which
        ``total // 5`` is whenever ``total < 5``).
        """
        return max(1, total // 5)

    def forward(self, x):
        """Run the network on ``x`` and return raw (pre-sigmoid) logits."""
        target_shape = x.shape
        skip_connections = []

        # Downsampling: remember the activation entering each max-pool so it
        # can be concatenated back on the way up.
        for mod in list(self.down.modules())[1:]:
            if isinstance(mod, nn.MaxPool2d):
                skip_connections.append(x)
            x = mod(x)

        # In-between downsampling and upsampling.
        for mod in list(self.bottom.modules())[1:]:
            x = mod(x)

        # Upsampling: after every transposed convolution, pad to the stored
        # skip tensor's spatial size if needed and concatenate on channels.
        for mod in list(self.up.modules())[1:]:
            x = mod(x)
            if isinstance(mod, nn.ConvTranspose2d):
                last = skip_connections.pop()
                if last.shape != x.shape:
                    x = F.pad(x, (0, last.shape[-1] - x.shape[-1],
                                  last.shape[-2] - x.shape[-2], 0),
                              mode='constant',
                              value=0)
                x = torch.cat((x, last), dim=1)

        # Last step.
        x = self.lastConv(x)

        # Resizing for targets.
        if x.shape != target_shape:
            x = self.avgPool(x)
        return x

    def train_model(self, data_loader, nb_epoch, lr):
        """Train the model and evaluate it at each epoch.

        Args:
            data_loader: Indexable pair of loaders; index 0 is used for the
                training phase and index 1 for the validation phase.
            nb_epoch: Number of epochs to run.
            lr: Learning rate handed to Adam.
        """
        self.train(True)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)
        self.best_loss = np.inf

        # Initialize a writer to output logs for graphs in tensorboard.
        log_dir = os.path.join(self.experiment, 'train', 'logs')
        self.writer = SummaryWriter(log_dir)

        self.save_path = os.path.join(self.experiment, 'train', 'models')
        # torch.save does not create missing directories.
        os.makedirs(self.save_path, exist_ok=True)
        print('Models will be saved to {}/{}'.format(os.path.dirname(__file__),
                                                     self.save_path))
        print('Start training...')

        # Save model every 1/5th of the total number of epochs; the helper
        # keeps the period >= 1 so short runs do not divide by zero.
        save_every = self._every_fifth(nb_epoch)
        for epoch in range(nb_epoch):
            self.current_epoch = epoch + 1
            print('Epoch n°{}'.format(self.current_epoch))
            self.train_epoch(data_loader, optimizer, criterion)
            if self.current_epoch % save_every == 0:
                print('Model saved.')
                torch.save(
                    self.state_dict(),
                    os.path.join(self.save_path,
                                 'model{}.pth'.format(self.current_epoch)))

        self.writer.export_scalars_to_json(
            os.path.join(log_dir, 'scalar_hist.json'))
        self.writer.close()

    def train_epoch(self, data_loader, optimizer, criterion):
        """Run one training phase and one validation phase, logging metrics.

        Reads ``self.writer``, ``self.current_epoch``, ``self.best_loss`` and
        ``self.save_path``, which :meth:`train_model` sets up.
        """
        threshold = 0.5  # Threshold for the activation function

        for p, phase in enumerate(['train', 'val']):
            is_train = phase == 'train'
            # Validate in eval mode so layers such as dropout/batch-norm (if
            # the blocks contain any) behave deterministically.
            self.train(is_train)

            # Loss and accuracy for the current epoch at each phase.
            running_loss = 0.0
            running_accuracy = 0.0
            running_recall = 0.0
            running_precision = 0.0

            # Estimation of center of grapes.
            centers_pred = np.array([])
            centers_data = np.array([])
            centers_true = np.array([])

            # Only enable gradients for training; the context manager
            # restores the previous grad mode once the phase is over.
            with torch.set_grad_enabled(is_train):
                for n_batch, data in enumerate(data_loader[p]):
                    # Read the data.
                    inputs = data['image'].to(self.device)
                    target = data['target'].to(self.device)

                    # Forward path + loss computing.
                    output = self(inputs)
                    loss = criterion(output, target)
                    running_loss += loss.item()

                    # Zero the parameter gradients and optimize the weights.
                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                    scores = self.activation(output)
                    pred = (scores > threshold).float()

                    # Compute TP, FP, FN and TN for precision, recall and
                    # accuracy.
                    # When 1 pixels are equal.
                    tp = torch.sum((pred == 1) * (target == 1)).item()
                    # Don't reward detection of berries' visible pixels.
                    tp -= torch.sum((pred == 1) * (inputs == 1)).item()
                    # When 1 pixels are on the background.
                    fp = torch.sum((pred == 1) * (target == 0)).item()
                    # When 0 pixels are on the berry.
                    fn = torch.sum((pred == 0) * (target == 1)).item()
                    # When 0 pixels are on the background.
                    tn = torch.sum((pred == 0) * (target == 0)).item()

                    running_precision += tp / (tp + fp) if (tp + fp) != 0 else 0
                    running_recall += tp / (tp + fn) if (tp + fn) != 0 else 0
                    running_accuracy += (tp + tn) / (tp + tn + fp + fn)

                    # Retrieve centers for estimation center error.
                    centers_data = np.append(centers_data,
                                             data['image_center'].numpy())
                    centers_true = np.append(centers_true,
                                             data['target_center'].numpy())
                    for j in range(inputs.shape[0]):
                        cX, cY = find_center(
                            pred[j, 0].cpu().numpy().astype(np.uint8), 'pred')
                        centers_pred = np.append(centers_pred,
                                                 np.array([cX, cY]))
                    # NOTE(review): only the first batch of each loader is
                    # processed per epoch because of this break — confirm it
                    # is intended and not a debugging leftover.
                    break

            # Normalize metrics.
            running_loss /= (n_batch + 1)
            running_precision /= (n_batch + 1)
            running_recall /= (n_batch + 1)
            running_accuracy /= (n_batch + 1)

            # Compute other metrics.
            if (running_precision + running_recall) != 0:
                f_score = (2 * running_precision * running_recall) / \
                    (running_precision + running_recall)
            else:
                f_score = 0

            # Curve-based metrics need the continuous sigmoid scores; feeding
            # the thresholded 0/1 mask would collapse them to one point.
            flat_target = target.view(-1).cpu().numpy()
            flat_scores = scores.detach().view(-1).cpu().numpy()
            # Scores are passed positionally: the keyword was renamed from
            # `probas_pred` to `y_score` in recent scikit-learn releases.
            pr_prec, pr_rec, _ = precision_recall_curve(flat_target,
                                                        flat_scores)
            auc_score = auc(pr_rec, pr_prec)
            ap_score = average_precision_score(y_true=flat_target,
                                               y_score=flat_scores)

            # Compute estimation centers error with L1-norm.
            centers_data = centers_data.reshape(-1, 2)
            centers_true = centers_true.reshape(-1, 2)
            centers_pred = centers_pred.reshape(-1, 2)
            l1_dist_baseline = np.abs(centers_true - centers_data)
            l1_dist_baseline = np.sum(l1_dist_baseline, axis=1)
            l1_dist_baseline = np.mean(l1_dist_baseline)
            l1_dist_pred = np.abs(centers_true - centers_pred)
            l1_dist_pred = np.sum(l1_dist_pred, axis=1)
            l1_dist_pred = np.mean(l1_dist_pred)

            # Matplotlib figure of the predictions stored on tensorboard.
            fig = prediction_figure(pred=pred,
                                    inputs=inputs,
                                    target=target,
                                    phase=phase,
                                    epoch=self.current_epoch)

            # Write computed metrics in tensorboard.
            scalars = (
                ('accuracy', running_accuracy),
                ('auc', auc_score),
                ('aver_prec', ap_score),
                ('l1-error-baseline', l1_dist_baseline),
                ('l1-error-pred', l1_dist_pred),
                ('f-score', f_score),
                ('loss', running_loss),
                ('precision', running_precision),
                ('recall', running_recall),
            )
            for tag, value in scalars:
                self.writer.add_scalar('{}/{}'.format(phase, tag), value,
                                       self.current_epoch)
            self.writer.add_pr_curve('{}/pr_curve'.format(phase), flat_target,
                                     flat_scores, self.current_epoch)
            self.writer.add_figure('{}/prediction'.format(phase), fig,
                                   self.current_epoch)

            if phase == 'val':
                # Save the model with the best loss.
                if running_loss < self.best_loss:
                    self.best_loss = running_loss
                    print('Save best model.')
                    torch.save(
                        self.state_dict(),
                        os.path.join(self.save_path, 'best_model.pth'))

        # Leave the module in training mode, as it was on entry from
        # train_model.
        self.train(True)

    def predict(self, test_loader, threshold):
        """Evaluate on a DataLoader, or complete masks for a raw batch.

        Args:
            test_loader: Either a ``DataLoader`` (evaluation: metrics are
                printed and report files written under
                ``<experiment>/eval``) or a tensor that is iterated image by
                image (presumably shaped [BxCxHxW], per the inline note).
            threshold: Score above which a pixel is predicted positive.

        Returns:
            For tensor input, a tensor like ``test_loader`` holding the
            thresholded predictions. For a ``DataLoader``, the thresholded
            prediction of the last batch (``None`` if the loader is empty).
        """
        self.eval()

        # Predictions and labels for the confusion matrix.
        y_true = np.array([])
        y_pred = np.array([])
        # Scores for the PR-Curve.
        y_score = np.array([])
        # Estimation of center of grapes.
        centers_pred = np.array([])
        centers_data = np.array([])
        centers_true = np.array([])

        pred = None
        with torch.no_grad():
            # Solely for evaluation purpose.
            if isinstance(test_loader, torch.utils.data.dataloader.DataLoader):
                print('Start evaluation...')
                # Progress period; kept >= 1 so loaders with fewer than five
                # batches do not trigger a modulo by zero.
                report_every = self._every_fifth(len(test_loader))
                for i, data in enumerate(test_loader):
                    # Read the data.
                    inputs = data['image'].to(self.device)

                    # Forward path.
                    output = self(inputs)
                    scores = self.activation(output)
                    pred = (scores > threshold).float()

                    # The mask below lives on self.device, so the target has
                    # to be moved there before it can be indexed with it.
                    target = data['target'].to(self.device)

                    # Drop pixels predicted positive that are already set in
                    # the input.
                    keep = ~((pred == 1) & (inputs == 1))
                    preds = pred[keep]
                    target = target[keep]
                    scores = scores[keep]

                    y_pred = np.append(y_pred, preds.view(-1).cpu().numpy())
                    y_true = np.append(y_true, target.view(-1).cpu().numpy())
                    y_score = np.append(y_score,
                                        scores.view(-1).cpu().numpy())

                    centers_data = np.append(centers_data,
                                             data['image_center'].numpy())
                    centers_true = np.append(centers_true,
                                             data['target_center'].numpy())
                    for j in range(inputs.shape[0]):
                        cX, cY = find_center(
                            pred[j, 0].cpu().numpy().astype(np.uint8), 'pred')
                        if cX != 0 and cY != 0:
                            centers_pred = np.append(centers_pred,
                                                     np.array([cX, cY]))
                        else:
                            # Fall back to the input image's center.
                            centers_pred = np.append(
                                centers_pred, data['image_center'][j].numpy())

                    if (i + 1) % report_every == 0:
                        print('Done: {}/{}'.format(i + 1, len(test_loader)))

                print('Evaluation is finished. Metrics are being computed...')
                save_classification_report(
                    y_true, y_pred, threshold,
                    os.path.join(self.experiment, 'eval', 'class_rep.png'))

                centers_data = centers_data.reshape(-1, 2)
                centers_true = centers_true.reshape(-1, 2)
                centers_pred = centers_pred.reshape(-1, 2)
                l1_dist_baseline = np.abs(centers_true - centers_data)
                l1_dist_baseline = np.sum(l1_dist_baseline, axis=1)
                l1_dist_baseline = np.mean(l1_dist_baseline)
                l1_dist_pred = np.abs(centers_true - centers_pred)
                l1_dist_pred = np.sum(l1_dist_pred, axis=1)
                l1_dist_pred = np.mean(l1_dist_pred)
                print('Baseline center error: {}'.format(l1_dist_baseline))
                print('Center prediction error: {}'.format(l1_dist_pred))

                print('Classification report plot saved successfully.')
                save_pr_curve_plot(
                    y_true, y_score,
                    os.path.join(self.experiment, 'eval', 'pr_cruve.html'))
                print('PR Curve plot saved successfully.')
            else:
                print('Start amodal completion...')
                # When testing in real time, not with synthetic dataset;
                # must be a tensor of shape [BxCxHxW].
                pred = torch.empty_like(test_loader)
                orig_shape = test_loader.shape
                for b, img in enumerate(test_loader):
                    img = rescale(img, (225, 325))
                    output = self(img.to(self.device))
                    res = (self.activation(output) > threshold).float()
                    # Bring the mask back to the caller's spatial size.
                    res = rescale(res.squeeze().cpu(), orig_shape[2:])
                    pred[b] = res.bool()
        return pred
"xcosx": n_iter * np.cos(n_iter), "arctanx": np.arctan(n_iter) }, n_iter) x = torch.rand(32, 3, 64, 64) # output from network if n_iter % 10 == 0: x = vutils.make_grid(x, normalize=True, scale_each=True) writer.add_image('Image', x, n_iter) x = torch.zeros(sample_rate * 2) for i in range(x.size(0)): x[i] = np.cos( freqs[n_iter // 10] * np.pi * float(i) / float(sample_rate)) # sound amplitude should in [-1, 1] writer.add_audio('myAudio', x, n_iter, sample_rate=sample_rate) writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter) for name, param in vgg16.named_parameters(): writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter) writer.add_pr_curve('xoxo', np.random.randint(2, size=100), np.random.rand(100), n_iter) #needs tensorboard 0.4RC or later dataset = datasets.MNIST('mnist', train=False, download=True) images = dataset.test_data[:100].float() label = dataset.test_labels[:100] features = images.view(100, 784) writer.add_embedding(features, metadata=label, label_img=images.unsqueeze(1)) # export scalar data to JSON for external processing writer.export_scalars_to_json("./all_scalars.json") writer.close()
def run(args):
    """Train and validate the clear-cell grade model, logging to TensorBoard.

    Each loader item is unpacked as ``(inputs, labels, header)`` with a batch
    size of one. The model produces one logit per slice; the per-item
    prediction is the majority vote over slices and the per-item probability
    is the fraction of slices predicted positive.

    Args:
        args: Parsed options. Reads ``model``, ``prefix_name``, ``epochs``,
            ``weight``, ``lr``, ``gamma``, ``lr_scheduler``, ``gpu`` and
            ``log_every``.
    """
    print('Task 1: clear cell grade prediction')
    path = '/data/larson2/RCC_dl/new/clear_cell/'

    transform = {
        'train':
        transforms.Compose([
            transforms.Lambda(lambda x: torch.Tensor(x)),
            # Reset dynamic range.
            src.dataloader.Rescale(-160, 240, zero_center=True),
            transforms.Lambda(
                lambda x: x.repeat(3, 1, 1, 1).permute(3, 0, 1, 2)),
            src.dataloader.RandomHorizontalFlip(),
            src.dataloader.Resize(256)
        ]),
        'val':
        transforms.Compose([
            transforms.Lambda(lambda x: torch.Tensor(x)),
            # Reset dynamic range.
            src.dataloader.Rescale(-160, 240, zero_center=True),
            transforms.Lambda(
                lambda x: x.repeat(3, 1, 1, 1).permute(3, 0, 1, 2)),
            src.dataloader.Resize(256)
        ])
    }

    # Each split uses its own pipeline: validation must not go through the
    # random augmentation of the training transform.
    my_dataset = {
        split: src.dataloader.RCCDataset_h5(path,
                                            mode=split,
                                            transform=transform[split])
        for split in ['train', 'val']
    }
    my_loader = {
        x: DataLoader(my_dataset[x],
                      batch_size=1,
                      shuffle=True,
                      num_workers=4)
        for x in ['train', 'val']
    }

    print('train size: ', len(my_loader['train']))
    print('val size: ', len(my_loader['val']))

    ### Some Checkers
    # Fetch a single sample once instead of spinning up the loader workers
    # for every statistic.
    sample_inputs = next(iter(my_loader['train']))[0]
    print('Summary: ')
    print('\ttrain size: ', len(my_loader['train']))
    print('\tval size: ', len(my_loader['val']))
    print('\tDatatype = ', sample_inputs.dtype)
    print('\tMin = ', sample_inputs.min())
    print('\tMax = ', sample_inputs.max())
    print('\tInput size', sample_inputs.shape)

    ### Tensorboard Log Setup
    log_root_folder = "/data/larson2/RCC_dl/logs/"
    now = datetime.now()
    now = now.strftime("%Y%m%d-%H%M%S")
    logdir = os.path.join(
        log_root_folder,
        f"{now}_model_{args.model}_{args.prefix_name}_epoch_{args.epochs}_weight_{args.weight}_lr_{args.lr}_gamma_{args.gamma}_lrsche_{args.lr_scheduler}_{now}"
    )
    print(f'\tlogdir = {logdir}')
    writer = SummaryWriter(logdir)

    ### Model Selection
    device = torch.device(
        "cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu")
    model = src.model.TDNet()
    model = model.to(device)
    writer.add_graph(model, my_dataset['train'][0][0].to(device))
    print('\tCuda:', torch.cuda.is_available(), f'\n\tdevice = {device}')

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.1)
    scheduler = None
    if args.lr_scheduler == "plateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               patience=3,
                                                               factor=.3,
                                                               threshold=1e-4,
                                                               verbose=True)
    elif args.lr_scheduler == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=3,
                                                    gamma=args.gamma)

    pos_weight = torch.FloatTensor([args.weight]).to(device)
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    ### Here we go
    for epoch in range(args.epochs):
        current_lr = get_lr(optimizer)
        epoch_loss = {'train': 0., 'val': 0.}
        epoch_corrects = {'train': 0., 'val': 0.}

        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)
            n_batches = len(my_loader[phase])

            running_losses = []
            running_corrects = 0.
            y_trues = []
            y_probs = []
            y_preds = []

            print('lr: ', current_lr)
            for i, (inputs, labels, _header) in enumerate(my_loader[phase]):
                optimizer.zero_grad()
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Forward; track history only in train.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs.float())  # raw logits
                    # [0, 1] probability, shape = s * 1
                    probs = torch.sigmoid(outputs)
                    # 0 or 1, shape = s * 1, prediction for each slice
                    preds = torch.round(probs)
                    # Majority vote, shape = 1, prediction for each patient.
                    pt_pred, _ = torch.mode(preds, 0)
                    # Fraction of slices voted positive, used as the
                    # patient-level probability.
                    pt_prob = (preds == 1).sum().float() / preds.shape[0]

                    # Convert label to slice level; inputs shape is
                    # 1*s*3*256*256.
                    loss = criterion(outputs,
                                     labels.repeat(inputs.shape[1], 1))

                    # Backward + optimize only if in training phase.
                    if is_train:
                        loss.backward()
                        optimizer.step()

                running_losses.append(loss.item())
                # Count correct patients (majority vote), since the total is
                # later divided by the number of patients; summing correct
                # slices instead would push the ratio above 1.
                running_corrects += float(pt_pred.item() == labels.item())

                y_trues.append(int(labels.item()))
                y_probs.append(pt_prob.item())  # use ratio to get probability
                y_preds.append(pt_pred.item())

                writer.add_scalar(f'{phase}/Loss', loss.item(),
                                  epoch * n_batches + i)

                if i > 0 and i % args.log_every == 0:
                    print(
                        'Epoch: {0}/{1} | Single batch number : {2}/{3} | avg loss:{4} | Acc: {5:.4f} | lr: {6}'
                        .format(epoch + 1, args.epochs, i, n_batches,
                                np.round(np.mean(running_losses), 4),
                                running_corrects / (i + 1), current_lr))

            # StepLR is advanced after the optimizer updates of the epoch, as
            # PyTorch >= 1.1 expects.
            if is_train and args.lr_scheduler == "step":
                scheduler.step()

            # One PR curve per phase and epoch, under the phase's tag group.
            writer.add_pr_curve(f'{phase}/pr_curve', np.array(y_trues),
                                np.array(y_probs), epoch)

            # Epoch statistics.
            epoch_loss[phase] = np.round(np.mean(running_losses), 4)
            epoch_corrects[phase] = running_corrects / n_batches

            cm = confusion_matrix(y_trues, y_preds, labels=[0, 1])
            src.helper.print_cm(cm, ['0', '1'])
            sens, spec, acc = src.helper.compute_stats(y_trues, y_preds)
            print('sens: {:.4f}'.format(sens))
            print('spec: {:.4f}'.format(spec))
            print('acc:  {:.4f}'.format(acc))
            print()

        # ReduceLROnPlateau only acts when it is fed the monitored metric.
        if args.lr_scheduler == "plateau":
            scheduler.step(epoch_loss['val'])

        print(
            'Summary  train loss: {0} | val loss: {1} | train acc: {2:.4f} | val acc: {3:.4f}'
            .format(epoch_loss['train'], epoch_loss['val'],
                    epoch_corrects['train'], epoch_corrects['val']))
        print('-' * 30)

    # Flush pending events to disk.
    writer.close()
def main(args):
    """Train a PointNet / PointNet++ classifier on radar frames.

    Builds the datasets and the selected model, resumes from
    ``experiment/checkpoints/<exp_name>.pth`` under the project directory
    when that file exists, then trains for ``args.epoch`` epochs. Every 10th
    epoch the model is evaluated on the training and test sets, metrics are
    written to TensorBoard and a checkpoint is saved.

    Args:
        args: Parsed command-line options (device selection, data path,
            model and optimizer choice, schedule and logging settings).

    Raises:
        Exception: If ``args.model_name`` is not a known network.
        ValueError: If ``args.optimizer`` is not ``'SGD'`` or ``'ADAM'``.
    """
    # --- SELECT DEVICES ---
    # Select either gpu or cpu.
    device = torch.device("cuda" if args.cuda else "cpu")
    # Select among available GPUs.
    if args.cuda:
        os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
            str(x) for x in args.gpudevice)

    # --- CREATE EXPERIMENTS DIRECTORY AND LOGGERS IN TENSORBOARD ---
    projdir = sys.path[0]
    # Path for saving and loading the network. The components are joined
    # separately so the layout is the same on every OS (a literal backslash
    # is just part of a file name on POSIX).
    saveloadpath = os.path.join(projdir, 'experiment', 'checkpoints',
                                args.exp_name + '.pth')
    Path(os.path.dirname(saveloadpath)).mkdir(exist_ok=True, parents=True)
    tblogdir = os.path.join(projdir, 'experiment', 'tensorboardX',
                            args.exp_name)
    Path(tblogdir).mkdir(exist_ok=True, parents=True)
    # The writer records everything shown in TensorBoard; flush_secs is how
    # long pending events may wait before being written to disk.
    tb_writer = SummaryWriter(logdir=tblogdir,
                              flush_secs=3,
                              write_to_disk=True)

    # --- INIT DATASETS AND DATALOADER (FOR SINGLE EPOCH) ---
    # Read data from file and split it into multi-frame training and testing
    # data. Ts is the frame period: the radar records one frame every 82 ms.
    train_dataset, test_dataset, class_names = read_dataset(
        args.datapath, Ts=0.082, train_test_split=0.8)

    # Every frame is resampled to a unified number of detection points.
    # Init test dataset (no data augmentation for the test dataset).
    test_dataTransformations = transforms.Compose(
        [NormalizeTime(), Resampling(maxPointsPerFrame=10)])
    testDataset = RadarClassDataset(dataset=test_dataset,
                                    transforms=test_dataTransformations,
                                    sequence_length=1)
    # Init train dataset.
    train_dataTransformations = transforms.Compose([
        NormalizeTime(),
        DataAugmentation(),
        Resampling(maxPointsPerFrame=10)
    ])
    trainDataset = RadarClassDataset(dataset=train_dataset,
                                     transforms=train_dataTransformations,
                                     sequence_length=1)
    # Dataloader for training, batch_size frames per batch.
    trainDataLoader = DataLoader(trainDataset,
                                 batch_size=args.batchsize,
                                 shuffle=True,
                                 num_workers=args.num_workers)

    # --- INIT NETWORK MODEL ---
    # Load selected network model and put it on the right device.
    if args.model_name == 'pointnet':
        classifier = PointNetCls(dim=args.pointCoordDim,
                                 num_class=args.numclasses,
                                 feature_transform=args.feature_transform)
    elif args.model_name == 'pointnet2':
        classifier = PointNet2ClsMsg(
            dim=args.pointCoordDim,
            num_class=args.numclasses,
        )
    else:
        raise Exception(
            'Argument "model_name" does not match existent networks')
    classifier = classifier.to(device)

    # --- LOAD NETWORK IF EXISTS ---
    if os.path.exists(saveloadpath):
        print('Using pretrained model found...')
        # map_location lets a checkpoint written on GPU be resumed on CPU
        # (and vice versa).
        checkpoint = torch.load(saveloadpath, map_location=device)
        start_epoch = checkpoint['epoch'] + 1
        iteration = checkpoint['iteration']
        best_test_acc = checkpoint['test_accuracy']
        classifier.load_state_dict(checkpoint['model_state_dict'])
        # NOTE(review): the checkpoint also stores 'optimizer_state_dict',
        # but it is not restored here, so optimizer state and learning-rate
        # schedule restart from args.lr on resume — confirm this is intended.
    else:
        print('No existing model, starting training from scratch...')
        # Epoch and iteration counters start from 1 rather than 0.
        start_epoch = 1
        iteration = 1
        best_test_acc = 0

    # --- CREATE OPTIMIZER ---
    if args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(classifier.parameters(),
                                    lr=args.lr,
                                    momentum=0.9)
    elif args.optimizer == 'ADAM':
        optimizer = torch.optim.Adam(classifier.parameters(),
                                     lr=args.lr,
                                     betas=(0.9, 0.999),
                                     eps=1e-08,
                                     weight_decay=args.decay_rate)
    else:
        # Fail here instead of with a NameError on first use below.
        raise ValueError(
            'Argument "optimizer" does not match supported optimizers')
    # Halve (0.5) the learning rate every 'step_size' epochs.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=args.lr_epoch_half,
                                                gamma=0.5)

    # Log info.
    printparams = 'Model parameters:' + json.dumps(
        vars(args), indent=4, sort_keys=True)
    print(printparams)
    tb_writer.add_text('hyper-parameters', printparams, iteration)
    tb_writer.add_text(
        'dataset', 'dataset sample size: training: {}, test: {}'.format(
            train_dataset.shape[0], test_dataset.shape[0]), iteration)

    # --- START TRANING ---
    for epoch in range(start_epoch, args.epoch + 1):
        print('Epoch %d/%s:' % (epoch, args.epoch))

        # Log the learning rate used for this epoch.
        tb_writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'],
                             iteration)

        # Put the module in training mode once per epoch: nothing inside the
        # batch loop changes the mode, only the evaluation below may.
        classifier.train()
        for batch_id, data in tqdm(enumerate(trainDataLoader, 0),
                                   total=len(trainDataLoader),
                                   smoothing=0.9):
            # (B:batch x S:seq x C:features x N:points), (B x S:seq)
            points, target = data
            # Squeeze to drop the sequence dimension (equal to 1), convert to
            # float and move to device:
            # (B:batch x C:features x N:points), (B)
            points = points.squeeze(dim=1).float().to(device)
            target = target.float().to(device)

            # Reset gradients.
            optimizer.zero_grad()
            # Forward propagation.
            pred = classifier(points)

            # The networks output log_softmax, and
            # "log_softmax -> nll_loss" == CrossEntropyLoss, so nll_loss is
            # all that is needed here.
            loss = F.nll_loss(pred, target.long())
            if args.model_name == 'pointnet':
                loss += feature_transform_regularizer(classifier.trans) * 0.001
                if args.feature_transform:
                    loss += feature_transform_regularizer(
                        classifier.trans_feat) * 0.001

            # Back propagate and update weights.
            loss.backward()
            optimizer.step()

            # Log the training loss once every 5 batches.
            if not batch_id % 5:
                tb_writer.add_scalar('train_loss/cross_entropy', loss.item(),
                                     iteration)
            iteration += 1
        scheduler.step()

        # --- TEST AND SAVE NETWORK ---
        if not epoch % 10:  # Done every 10th epoch.
            # Perform predictions on the training and testing data.
            train_targ, train_pred = test(classifier,
                                          trainDataset,
                                          device,
                                          num_workers=args.num_workers,
                                          batch_size=1800)
            test_targ, test_pred = test(classifier,
                                        testDataset,
                                        device,
                                        num_workers=args.num_workers,
                                        batch_size=1800)

            # Accuracy on both splits.
            train_acc = metrics_accuracy(train_targ, train_pred)
            test_acc = metrics_accuracy(test_targ, test_pred)
            print('\r Training loss: {}'.format(loss.item()))
            print('Train Accuracy: {}\nTest Accuracy: {}'.format(
                train_acc, test_acc))
            tb_writer.add_scalars('metrics/accuracy', {
                'train': train_acc,
                'test': test_acc
            }, iteration)

            # Confusion matrix, absolute and normalized.
            confmatrix_test = metrics_confusion_matrix(test_targ, test_pred)
            print('Test confusion matrix: \n', confmatrix_test)
            fig, ax = plot_confusion_matrix(confmatrix_test,
                                            class_names,
                                            normalize=False,
                                            title='Test Confusion Matrix')
            fig_n, ax_n = plot_confusion_matrix(
                confmatrix_test,
                class_names,
                normalize=True,
                title='Test Confusion Matrix - Normalized')
            tb_writer.add_figure('test_confusion_matrix/abs',
                                 fig,
                                 global_step=iteration,
                                 close=True)
            tb_writer.add_figure('test_confusion_matrix/norm',
                                 fig_n,
                                 global_step=iteration,
                                 close=True)

            # One-vs-rest precision recall curve per class.
            for idx, clsname in enumerate(class_names):
                # Convert log_softmax to softmax (the actual probability)
                # and select the desired class.
                test_pred_binary = torch.exp(test_pred[:, idx])
                test_targ_binary = test_targ.eq(idx)
                tb_writer.add_pr_curve(tag='pr_curves/' + clsname,
                                       labels=test_targ_binary,
                                       predictions=test_pred_binary,
                                       global_step=iteration)

            # --- SAVE NETWORK ---
            # For now save every time, since testing only covers a subset of
            # the test dataset; 'best' therefore means 'latest'.
            best_test_acc = test_acc
            state = {
                'epoch': epoch,
                'iteration': iteration,
                'train_accuracy': train_acc if args.train_metric else 0.0,
                'test_accuracy': best_test_acc,
                'model_state_dict': classifier.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }
            torch.save(state, saveloadpath)
            print('Model saved!!!')
            print('Best Accuracy: %f' % best_test_acc)

    tb_writer.close()
class Training(object):
    """Train and evaluate an embedder + LSTM encoder + classifier pipeline.

    The trainer combines a supervised classification loss with optional
    adversarial (AT), virtual adversarial (VAT), entropy and auto-encoder
    losses, each weighted by the matching ``config.lambda_*`` value.
    Scalars and PR curves are written to a ``SummaryWriter`` and the
    exponential-moving-average (EMA) weights are what gets checkpointed.
    """

    def __init__(self, config, logger=None):
        """Build the models, optimizer, optional LR scheduler and EMA trackers.

        :param config: experiment configuration object; the attributes read
            here include ``id2label``, ``num_classes``, ``lambda_ae``, ``lr``,
            ``beta1``, ``beta2``, ``weight_decay``, ``eps``, ``scheduler``
            and ``multi_gpu``.
        :param logger: logger to use; when ``None`` a DEBUG-level logger
            named ``'logger'`` is created.
        """
        if logger is None:
            logger = logging.getLogger('logger')
            logger.setLevel(logging.DEBUG)
            logging.basicConfig(format='%(message)s', level=logging.DEBUG)

        self.logger = logger
        self.config = config
        self.classes = list(config.id2label.keys())
        self.num_classes = config.num_classes

        self.embedder = Embedder(self.config)
        self.encoder = LSTMEncoder(self.config)
        self.clf = Classifier(self.config)
        self.clf_loss = SequenceCriteria(class_weight=None)
        # The auto-encoder only exists when its loss term is enabled.
        if self.config.lambda_ae > 0:
            self.ae = AEModel(self.config)

        self.writer = SummaryWriter(log_dir="TFBoardSummary")
        self.global_steps = 0
        self.enc_clf_opt = Adam(self._get_trainabe_modules(),
                                lr=self.config.lr,
                                betas=(config.beta1, config.beta2),
                                weight_decay=config.weight_decay,
                                eps=config.eps)

        # NOTE: self.scheduler is only defined for these two scheduler names;
        # every later use is guarded by the same config.scheduler comparison.
        if config.scheduler == "ReduceLROnPlateau":
            self.scheduler = lr_scheduler.ReduceLROnPlateau(
                self.enc_clf_opt,
                mode='max',
                factor=config.lr_decay,
                patience=config.patience,
                verbose=True)
        elif config.scheduler == "ExponentialLR":
            self.scheduler = lr_scheduler.ExponentialLR(self.enc_clf_opt,
                                                        gamma=config.gamma)

        self._init_or_load_model()
        if config.multi_gpu:
            self.embedder.cuda()
            self.encoder.cuda()
            self.clf.cuda()
            self.clf_loss.cuda()
            if self.config.lambda_ae > 0:
                self.ae.cuda()

        # EMA copies of the weights; these are what _save_model() persists.
        self.ema_embedder = ExponentialMovingAverage(decay=0.999)
        self.ema_embedder.register(self.embedder.state_dict())
        self.ema_encoder = ExponentialMovingAverage(decay=0.999)
        self.ema_encoder.register(self.encoder.state_dict())
        self.ema_clf = ExponentialMovingAverage(decay=0.999)
        self.ema_clf.register(self.clf.state_dict())

        self.time_s = time()

    def _get_trainabe_modules(self):
        """Return a new list with the parameters of every trained module.

        Embedder, encoder and classifier parameters are always included; the
        auto-encoder parameters are appended when ``config.lambda_ae > 0``.
        """
        param_list = list(self.embedder.parameters()) + \
            list(self.encoder.parameters()) + \
            list(self.clf.parameters())
        if self.config.lambda_ae > 0:
            param_list += list(self.ae.parameters())
        return param_list

    def _get_l2_norm_loss(self):
        """Return the sum of the L2 norms of all trainable parameters."""
        total_norm = 0.
        for p in self._get_trainabe_modules():
            param_norm = p.data.norm(p=2)
            total_norm += param_norm  # ** 2
        return total_norm  # / 2.

    def _init_or_load_model(self):
        """Create the output directory and reset epoch / best-accuracy state."""
        # if not self._load_model():
        ensure_directory(self.config.output_path)
        self.epoch = 0
        self.best_accuracy = -np.inf

    def _compute_vocab_freq(self, train_, dev_):
        """Count token-id occurrences over the train and dev sets.

        :param train_: iterable of ``(_, ids)`` pairs.
        :param dev_: iterable of ``(_, ids)`` pairs.
        :return: tensor of length ``config.n_vocab`` holding per-id counts,
            cast to ``batch_utils.FLOAT_TYPE``.
        """
        counter = collections.Counter()
        for _, ids_ in train_:
            counter.update(ids_)
        for _, ids_ in dev_:
            counter.update(ids_)
        word_freq = np.zeros(self.config.n_vocab)
        for index_, freq_ in counter.items():
            word_freq[index_] = freq_
        return torch.from_numpy(word_freq).type(batch_utils.FLOAT_TYPE)

    def _save_model(self):
        """Write the EMA shadow weights, epoch and best accuracy to disk."""
        state = {
            'epoch': self.epoch,
            # The EMA shadow variables are stored instead of the raw
            # state_dict() of each module.
            'state_dict_encoder': self.ema_encoder.shadow_variable_dict,
            'state_dict_embedder': self.ema_embedder.shadow_variable_dict,
            'state_dict_clf': self.ema_clf.shadow_variable_dict,
            'best_accuracy': self.best_accuracy
        }
        torch.save(
            state,
            os.path.join(self.config.output_path, self.config.model_file))

    def _load_model(self):
        """Restore encoder/classifier weights from the configured checkpoint.

        The checkpoint is only read when ``config.load_checkpoint`` is set and
        the file exists. The embedder weights are intentionally not restored
        (that load is disabled below).

        :return: ``True`` when a checkpoint was loaded, ``False`` otherwise.
        """
        checkpoint_path = os.path.join(self.config.output_path,
                                       self.config.model_file)
        if self.config.load_checkpoint and os.path.isfile(checkpoint_path):
            # Code taken from here: https://github.com/pytorch/examples/blob/master/imagenet/main.py
            dict_ = torch.load(checkpoint_path)
            self.epoch = dict_['epoch']
            self.best_accuracy = dict_['best_accuracy']
            # self.embedder.load_state_dict(dict_['state_dict_embedder'])
            self.encoder.load_state_dict(dict_['state_dict_encoder'])
            self.clf.load_state_dict(dict_['state_dict_clf'])
            self.logger.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_path, self.epoch))
            return True
        return False

    def __call__(self,
                 train,
                 dev,
                 test,
                 unlabel,
                 addn,
                 addn_un,
                 addn_test,
                 ek,
                 ek_t,
                 ek_u,
                 graph_embs,
                 graph_embs_t,
                 graph_embs_u,
                 addn_test_fr=None,
                 addn_test_f=None,
                 addn_test_r=None,
                 mode="train",
                 checkPth=None):
        """Train then evaluate (``mode == "train"``) or evaluate a checkpoint.

        In any other mode the weights are loaded from ``checkPth`` and only
        the evaluation runs. When ``config.behaviour_test`` is set, the three
        extra ``addn_test_*`` feature sets are evaluated as well.
        """
        self.logger.info('Start training')
        if mode == "train":
            self._train(train, dev, unlabel, addn, addn_un, addn_test, ek,
                        ek_t, ek_u, graph_embs, graph_embs_t, graph_embs_u)
            if self.config.behaviour_test:
                self._evaluate(test, addn_test, ek_t, graph_embs_t,
                               addn_test_fr, addn_test_f, addn_test_r)
            else:
                self._evaluate(test, addn_test, ek_t, graph_embs_t)
        else:
            model = torch.load(checkPth)
            self.embedder.load_state_dict(model['state_dict_embedder'])
            self.encoder.load_state_dict(model['state_dict_encoder'])
            self.clf.load_state_dict(model['state_dict_clf'])
            # Forward the mode: with the default mode="train" _evaluate would
            # call _load_model() and could overwrite the encoder/classifier
            # weights that were just loaded from checkPth.
            if self.config.behaviour_test:
                self._evaluate(test, addn_test, ek_t, graph_embs_t,
                               addn_test_fr, addn_test_f, addn_test_r,
                               mode=mode)
            else:
                self._evaluate(test, addn_test, ek_t, graph_embs_t, mode=mode)

    def _create_iter(self, data_, wbatchsize, random_shuffler=None):
        """Return a pooled batch iterator over ``data_``.

        Examples are keyed by the length of their second element and batch
        size is measured by ``batch_size_fn``.

        :param random_shuffler: forwarded to ``data.iterator.pool``;
            ``None`` keeps that function's default behaviour.
        """
        iter_ = data.iterator.pool(data_,
                                   wbatchsize,
                                   key=lambda x: len(x[1]),
                                   batch_size_fn=batch_size_fn,
                                   random_shuffler=random_shuffler)
        return iter_

    def _run_epoch(self, train_data, dev_data, unlabel_data, addn_data,
                   addn_data_unlab, addn_dev, ek, ek_t, ek_u, graph_embs,
                   graph_embs_t, graph_embs_u):
        """Run one training epoch and return the end-of-epoch dev accuracy.

        Every ``config.eval_steps`` global steps the dev set is evaluated,
        logged to the summary writer, and the model is saved when the dev
        accuracy improves.
        """
        # Keep the results: the original statements discarded the return
        # value of .cuda().
        addn_dev = addn_dev.cuda()
        ek_t = ek_t.cuda()
        graph_embs_t = graph_embs_t.cuda()

        report_stats = utils.Statistics()
        cm = ConfusionMatrix(self.classes)
        _, seq_data = list(zip(*train_data))
        total_seq_words = len(list(itertools.chain.from_iterable(seq_data)))
        iter_per_epoch = (1.5 * total_seq_words) // self.config.wbatchsize

        self.encoder.train()
        self.clf.train()

        train_iter = self._create_iter(train_data, self.config.wbatchsize)
        unlabel_iter = self._create_iter(unlabel_data,
                                         self.config.wbatchsize_unlabel)

        # Running offsets into the additional-feature tensors for the
        # labelled (sofar) and unlabelled (sofar_1) streams.
        sofar = 0
        sofar_1 = 0
        for batch_index, train_batch_raw in enumerate(train_iter):
            seq_iter = list(zip(*train_batch_raw))[1]
            seq_words = len(list(itertools.chain.from_iterable(seq_iter)))
            report_stats.n_words += seq_words
            self.global_steps += 1

            # self.enc_clf_opt.zero_grad()
            if self.config.add_noise:
                train_batch_raw = add_noise(train_batch_raw,
                                            self.config.noise_dropout,
                                            self.config.random_permutation)
            train_batch = batch_utils.seq_pad_concat(train_batch_raw, -1)

            train_embedded = self.embedder(train_batch)
            memory_bank_train, enc_final_train = self.encoder(
                train_embedded, train_batch)

            # An unlabelled batch is only needed by the VAT, AE and entropy
            # terms.
            if self.config.lambda_vat > 0 or self.config.lambda_ae > 0 or self.config.lambda_entropy:
                try:
                    unlabel_batch_raw = next(unlabel_iter)
                except StopIteration:
                    # Unlabelled data exhausted: restart its iterator.
                    unlabel_iter = self._create_iter(
                        unlabel_data, self.config.wbatchsize_unlabel)
                    unlabel_batch_raw = next(unlabel_iter)

                if self.config.add_noise:
                    unlabel_batch_raw = add_noise(
                        unlabel_batch_raw, self.config.noise_dropout,
                        self.config.random_permutation)
                unlabel_batch = batch_utils.seq_pad_concat(
                    unlabel_batch_raw, -1)
                unlabel_embedded = self.embedder(unlabel_batch)
                memory_bank_unlabel, enc_final_unlabel = self.encoder(
                    unlabel_embedded, unlabel_batch)
                addn_batch_unlab = retAddnBatch(addn_data_unlab,
                                                memory_bank_unlabel.shape[0],
                                                sofar_1).cuda()
                ek_batch_unlab = retAddnBatch(ek_u,
                                              memory_bank_unlabel.shape[0],
                                              sofar_1).cuda()
                graph_embs_unlab = retAddnBatch(graph_embs_u,
                                                memory_bank_unlabel.shape[0],
                                                sofar_1).cuda()
                sofar_1 += addn_batch_unlab.shape[0]
                if sofar_1 >= ek_u.shape[0]:
                    sofar_1 = 0

            addn_batch = retAddnBatch(addn_data, memory_bank_train.shape[0],
                                      sofar).cuda()
            ek_batch = retAddnBatch(ek, memory_bank_train.shape[0],
                                    sofar).cuda()
            graph_embs_batch = retAddnBatch(graph_embs,
                                            memory_bank_train.shape[0],
                                            sofar).cuda()
            sofar += addn_batch.shape[0]
            if sofar >= ek.shape[0]:
                sofar = 0

            pred = self.clf(memory_bank_train, addn_batch, ek_batch,
                            enc_final_train, graph_embs_batch)
            accuracy = self.get_accuracy(cm, pred.data,
                                         train_batch.labels.data)
            lclf = self.clf_loss(pred, train_batch.labels)

            # Disabled loss terms keep the placeholder value -1; they are
            # multiplied by their lambda when the total loss is formed.
            lat = Variable(
                torch.FloatTensor([-1.]).type(batch_utils.FLOAT_TYPE))
            lvat = Variable(
                torch.FloatTensor([-1.]).type(batch_utils.FLOAT_TYPE))
            if self.config.lambda_at > 0:
                lat = at_loss(
                    self.embedder,
                    self.encoder,
                    self.clf,
                    train_batch,
                    addn_batch,
                    ek_batch,
                    graph_embs_batch,
                    perturb_norm_length=self.config.perturb_norm_length)

            if self.config.lambda_vat > 0:
                lvat_train = vat_loss(
                    self.embedder,
                    self.encoder,
                    self.clf,
                    train_batch,
                    addn_batch,
                    ek_batch,
                    graph_embs_batch,
                    p_logit=pred,
                    perturb_norm_length=self.config.perturb_norm_length)
                if self.config.inc_unlabeled_loss:
                    if memory_bank_unlabel.shape[0] != ek_batch_unlab.shape[0]:
                        # The whole step (including the optimizer update) is
                        # skipped when the feature batch is short.
                        print(
                            f'Skipping; Unequal Shapes: {memory_bank_unlabel.shape} and {ek_batch_unlab.shape}'
                        )
                        continue
                    else:
                        lvat_unlabel = vat_loss(
                            self.embedder,
                            self.encoder,
                            self.clf,
                            unlabel_batch,
                            addn_batch_unlab,
                            ek_batch_unlab,
                            graph_embs_unlab,
                            p_logit=self.clf(memory_bank_unlabel,
                                             addn_batch_unlab,
                                             ek_batch_unlab,
                                             enc_final_unlabel,
                                             graph_embs_unlab),
                            perturb_norm_length=self.config.perturb_norm_length
                        )
                    if self.config.unlabeled_loss_type == "AvgTrainUnlabel":
                        lvat = 0.5 * (lvat_train + lvat_unlabel)
                    elif self.config.unlabeled_loss_type == "Unlabel":
                        lvat = lvat_unlabel
                else:
                    lvat = lvat_train

            lentropy = Variable(
                torch.FloatTensor([-1.]).type(batch_utils.FLOAT_TYPE))
            if self.config.lambda_entropy > 0:
                lentropy_train = entropy_loss(pred)
                if self.config.inc_unlabeled_loss:
                    lentropy_unlabel = entropy_loss(
                        self.clf(memory_bank_unlabel, addn_batch_unlab,
                                 ek_batch_unlab, enc_final_unlabel,
                                 graph_embs_unlab))
                    if self.config.unlabeled_loss_type == "AvgTrainUnlabel":
                        lentropy = 0.5 * (lentropy_train + lentropy_unlabel)
                    elif self.config.unlabeled_loss_type == "Unlabel":
                        lentropy = lentropy_unlabel
                else:
                    lentropy = lentropy_train

            lae = Variable(
                torch.FloatTensor([-1.]).type(batch_utils.FLOAT_TYPE))
            if self.config.lambda_ae > 0:
                lae = self.ae(memory_bank_unlabel, enc_final_unlabel,
                              unlabel_batch.sent_len, unlabel_batch_raw)

            ltotal = (self.config.lambda_clf * lclf) + \
                (self.config.lambda_ae * lae) + \
                (self.config.lambda_at * lat) + \
                (self.config.lambda_vat * lvat) + \
                (self.config.lambda_entropy * lentropy)

            report_stats.clf_loss += lclf.data.cpu().numpy()
            report_stats.at_loss += lat.data.cpu().numpy()
            report_stats.vat_loss += lvat.data.cpu().numpy()
            report_stats.ae_loss += lae.data.cpu().numpy()
            report_stats.entropy_loss += lentropy.data.cpu().numpy()
            report_stats.n_sent += len(pred)
            report_stats.n_correct += accuracy

            self.enc_clf_opt.zero_grad()
            ltotal.backward()

            params_list = self._get_trainabe_modules()
            # Excluding embedder form norm constraint when AT or VAT
            # NOTE(review): _get_trainabe_modules() already contains the
            # embedder parameters, so they appear twice in params_list when
            # normalize_embedding is off - confirm this is intended.
            if not self.config.normalize_embedding:
                params_list += list(self.embedder.parameters())

            # NOTE(review): clip_grad_norm is deprecated in favour of
            # clip_grad_norm_; kept as-is because the minimum supported torch
            # version is not visible from this file.
            norm = torch.nn.utils.clip_grad_norm(params_list,
                                                 self.config.max_norm)
            report_stats.grad_norm += norm
            self.enc_clf_opt.step()
            if self.config.scheduler == "ExponentialLR":
                self.scheduler.step()
            self.ema_embedder.apply(self.embedder.named_parameters())
            self.ema_encoder.apply(self.encoder.named_parameters())
            self.ema_clf.apply(self.clf.named_parameters())

            report_func(self.epoch, batch_index, iter_per_epoch, self.time_s,
                        report_stats, self.config.report_every, self.logger)

            if self.global_steps % self.config.eval_steps == 0:
                cm_, accuracy, prc_dev = self._run_evaluate(
                    dev_data, addn_dev, ek_t, graph_embs_t)
                self.logger.info(
                    "- dev accuracy {} | best dev accuracy {} ".format(
                        accuracy, self.best_accuracy))
                self.writer.add_scalar("Dev_Accuracy", accuracy,
                                       self.global_steps)
                pred_, lab_ = zip(*prc_dev)
                pred_ = torch.cat(pred_)
                lab_ = torch.cat(lab_)
                self.writer.add_pr_curve("Dev PR-Curve", lab_, pred_,
                                         self.global_steps)
                pprint.pprint(cm_)
                pprint.pprint(cm_.get_all_metrics())
                if accuracy > self.best_accuracy:
                    self.logger.info("- new best score!")
                    self.best_accuracy = accuracy
                    self._save_model()
                if self.config.scheduler == "ReduceLROnPlateau":
                    self.scheduler.step(accuracy)
                # _predict_batch switched these modules to eval mode.
                self.encoder.train()
                # self.embedder.train()
                self.clf.train()

        if self.config.weight_decay > 0:
            print(">> Square Norm: %1.4f " % self._get_l2_norm_loss())

        cm, train_accuracy, _ = self._run_evaluate(train_data, addn_data, ek,
                                                   graph_embs)
        self.logger.info("- Train accuracy {}".format(train_accuracy))
        pprint.pprint(cm.get_all_metrics())

        cm, dev_accuracy, _ = self._run_evaluate(dev_data, addn_dev, ek_t,
                                                 graph_embs_t)
        self.logger.info("- Dev accuracy {} | best dev accuracy {}".format(
            dev_accuracy, self.best_accuracy))
        pprint.pprint(cm.get_all_metrics())
        self.writer.add_scalars("Overall_Accuracy", {
            "Train_Accuracy": train_accuracy,
            "Dev_Accuracy": dev_accuracy
        }, self.global_steps)
        return dev_accuracy

    @staticmethod
    def get_accuracy(cm, output, target):
        """Count correct predictions and record the batch in ``cm``.

        :param cm: object exposing ``add_batch(targets, predictions)``.
        :param output: per-class scores; the arg-max over the last dimension
            is taken as the prediction.
        :param target: gold label tensor.
        :return: number of correct predictions in the batch.
        """
        predictions = output.max(-1)[1].type_as(target)
        correct = predictions.eq(target)
        correct = correct.float()
        if not hasattr(correct, 'sum'):
            correct = correct.cpu()
        correct = correct.sum()
        cm.add_batch(target.cpu().numpy(), predictions.cpu().numpy())
        return correct

    def _predict_batch(self, cm, batch, addn_batch, ek_batch,
                       graph_embs_batch):
        """Run the encoder and classifier in eval mode on one batch.

        :return: ``(pred, accuracy)`` - the classifier output and the number
            of correct predictions.
        """
        # self.embedder.eval()
        self.encoder.eval()
        self.clf.eval()
        one, two = self.encoder(self.embedder(batch), batch)
        pred = self.clf(one, addn_batch, ek_batch, two, graph_embs_batch)
        # NOTE(review): these files are overwritten on every batch, so only
        # the last evaluated batch survives - looks like a debugging aid.
        torch.save(batch, 'co_attn_text.pth')
        torch.save(self.clf.co_attn_1.seq_len_weights,
                   'co_attn_weights_eval.pth')
        accuracy = self.get_accuracy(cm, pred.data, batch.labels.data)
        return pred, accuracy

    def chunks(self, l, n=15):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def _run_evaluate(self, test_data, addn_test, ek_t, graph_embs_t):
        """Evaluate ``test_data`` in fixed-size chunks.

        Batches whose prediction raises are logged and skipped, but they
        still count in the denominator of the accuracy.

        :return: ``(cm, accuracy, pr_curve_data)`` where ``accuracy`` is a
            percentage over ``len(test_data)`` and ``pr_curve_data`` is a
            list of ``(class-1 probability, label)`` tensor pairs.
        """
        chunk_size = 15
        pr_curve_data = []
        cm = ConfusionMatrix(self.classes)
        accuracy_list = []
        # test_iter = self._create_iter(test_data, self.config.wbatchsize,
        #                               random_shuffler=utils.identity_fun)
        test_iter = self.chunks(test_data, chunk_size)

        for batch_index, test_batch in enumerate(test_iter):
            # The additional features are sliced in lock-step with the chunks.
            start = batch_index * chunk_size
            stop = start + chunk_size
            addn_batch = addn_test[start:stop]
            ek_batch = ek_t[start:stop]
            graph_embs_batch = graph_embs_t[start:stop]
            test_batch = batch_utils.seq_pad_concat(test_batch, -1)
            try:
                pred, acc = self._predict_batch(cm, test_batch, addn_batch,
                                                ek_batch, graph_embs_batch)
            except Exception as err:
                # Best-effort evaluation: keep going, but do not hide the
                # failure and do not swallow KeyboardInterrupt/SystemExit.
                self.logger.warning(
                    "- skipping evaluation batch %d: %s", batch_index, err)
                continue
            accuracy_list.append(acc)
            pr_curve_data.append(
                (F.softmax(pred, -1)[:, 1].data, test_batch.labels.data))
        accuracy = 100 * (sum(accuracy_list) / len(test_data))
        return cm, accuracy, pr_curve_data

    def _train(self, train_data, dev_data, unlabel_data, addn_data,
               addn_data_unlab, addn_dev, ek, ek_t, ek_u, graph_embs,
               graph_embs_t, graph_embs_u):
        """Run up to ``config.nepochs`` epochs with early stopping.

        Training stops once ``config.nepoch_no_imprv`` consecutive epochs
        fail to improve on the best dev accuracy.
        """
        addn_data = addn_data.cuda()
        addn_data_unlab = addn_data_unlab.cuda()
        addn_dev = addn_dev.cuda()
        ek = ek.cuda()
        ek_t = ek_t.cuda()
        ek_u = ek_u.cuda()
        graph_embs = graph_embs.cuda()
        graph_embs_t = graph_embs_t.cuda()
        graph_embs_u = graph_embs_u.cuda()

        # for early stopping
        nepoch_no_imprv = 0

        epoch_start = self.epoch + 1
        epoch_end = self.epoch + self.config.nepochs + 1
        for self.epoch in range(epoch_start, epoch_end):
            self.logger.info("Epoch {:} out of {:}".format(
                self.epoch, self.config.nepochs))
            # random.shuffle(train_data)
            # random.shuffle(unlabel_data)
            accuracy = self._run_epoch(train_data, dev_data, unlabel_data,
                                       addn_data, addn_data_unlab, addn_dev,
                                       ek, ek_t, ek_u, graph_embs,
                                       graph_embs_t, graph_embs_u)

            # early stopping and saving best parameters
            if accuracy > self.best_accuracy:
                nepoch_no_imprv = 0
                self.best_accuracy = accuracy
                self.logger.info("- new best score!")
                self._save_model()
            else:
                nepoch_no_imprv += 1
                if nepoch_no_imprv >= self.config.nepoch_no_imprv:
                    self.logger.info(
                        "- early stopping {} epochs without improvement".
                        format(nepoch_no_imprv))
                    break
            if self.config.scheduler == "ReduceLROnPlateau":
                self.scheduler.step(accuracy)

    def _evaluate(self,
                  test_data,
                  addn_test,
                  ek_t,
                  graph_embs_t,
                  addn_test_fr=None,
                  addn_test_f=None,
                  addn_test_r=None,
                  mode="train"):
        """Evaluate on the test set and dump predictions to a TSV file.

        With ``mode == "train"`` the configured checkpoint is reloaded first.
        Predictions and labels are written to
        ``<output_path>/<exp_name>_pred_gt.tsv``; when
        ``config.behaviour_test`` is set the three extra feature sets are
        evaluated and their accuracies logged.
        """
        addn_test = addn_test.cuda()
        ek_t = ek_t.cuda()
        graph_embs_t = graph_embs_t.cuda()
        self.logger.info("Evaluating model over test set")
        if mode == "train":
            self._load_model()
        _, accuracy, prc_test = self._run_evaluate(test_data, addn_test, ek_t,
                                                   graph_embs_t)
        if self.config.behaviour_test:
            addn_test_fr = addn_test_fr.cuda()
            addn_test_f = addn_test_f.cuda()
            addn_test_r = addn_test_r.cuda()
            _, accuracy_fr, prc_test_fr = self._run_evaluate(
                test_data, addn_test_fr, ek_t, graph_embs_t)
            _, accuracy_f, prc_test_f = self._run_evaluate(
                test_data, addn_test_f, ek_t, graph_embs_t)
            _, accuracy_r, prc_test_r = self._run_evaluate(
                test_data, addn_test_r, ek_t, graph_embs_t)

        pred_, lab_ = zip(*prc_test)
        pred_ = torch.cat(pred_).cpu().tolist()
        lab_ = torch.cat(lab_).cpu().tolist()
        path_ = os.path.join(self.config.output_path,
                             "{}_pred_gt.tsv".format(self.config.exp_name))
        with open(path_, 'w') as fp:
            for p, label in zip(pred_, lab_):
                fp.write(str(p) + '\t' + str(label) + '\n')
        self.logger.info("- test accuracy {}".format(accuracy))
        # Round the class-1 probabilities to hard labels for the matrix.
        pred_ = [round(p) for p in pred_]
        print('Normal Test Set: ', confusion_matrix(lab_, pred_))
        if self.config.behaviour_test:
            self.logger.info(
                "- behaviour test accuracy - fr {}".format(accuracy_fr))
            self.logger.info(
                "- behaviour test accuracy - f {}".format(accuracy_f))
            self.logger.info(
                "- behaviour test accuracy - r {}".format(accuracy_r))
def main():
    """Train a binary classifier from command-line arguments.

    Parses the arguments, builds the network, optimizer and
    ``ReduceLROnPlateau`` scheduler, optionally resumes from a checkpoint,
    then trains until either the learning rate reaches its minimum or the
    maximum number of iterations is exceeded. Training loss, learning rate,
    ROC-AUC and PR curves are logged to Tensorboard, and ``last``, periodic
    and ``bestval`` checkpoints are written under ``--models_dir``.
    """
    # Args
    parser = argparse.ArgumentParser()
    parser.add_argument('--net',
                        type=str,
                        help='Net model class',
                        required=True)
    parser.add_argument('--traindb',
                        type=str,
                        help='Training datasets',
                        nargs='+',
                        choices=split.available_datasets,
                        required=True)
    parser.add_argument('--valdb',
                        type=str,
                        help='Validation datasets',
                        nargs='+',
                        choices=split.available_datasets,
                        required=True)
    parser.add_argument('--face',
                        type=str,
                        help='Face crop or scale',
                        required=True,
                        choices=['scale', 'tight'])
    parser.add_argument('--size',
                        type=int,
                        help='Train patch size',
                        required=True)

    parser.add_argument('--batch',
                        type=int,
                        help='Batch size to fit in GPU memory',
                        default=32)
    parser.add_argument('--lr',
                        type=float,
                        default=1e-5,
                        help='Learning rate')
    parser.add_argument('--valint',
                        type=int,
                        help='Validation interval (iterations)',
                        default=500)
    parser.add_argument(
        '--patience',
        type=int,
        help='Patience before dropping the LR [validation intervals]',
        default=10)
    parser.add_argument('--maxiter',
                        type=int,
                        help='Maximum number of iterations',
                        default=20000)
    parser.add_argument('--init', type=str, help='Weight initialization file')
    parser.add_argument('--scratch',
                        action='store_true',
                        help='Train from scratch')

    parser.add_argument('--trainsamples',
                        type=int,
                        help='Limit the number of train samples per epoch',
                        default=-1)
    parser.add_argument(
        '--valsamples',
        type=int,
        help='Limit the number of validation samples per epoch',
        default=6000)

    parser.add_argument('--logint',
                        type=int,
                        help='Training log interval (iterations)',
                        default=100)
    parser.add_argument('--workers',
                        type=int,
                        help='Num workers for data loaders',
                        default=6)
    parser.add_argument('--device', type=int, help='GPU device id', default=0)
    parser.add_argument('--seed', type=int, help='Random seed', default=0)

    parser.add_argument('--debug', action='store_true', help='Activate debug')
    parser.add_argument('--suffix', type=str, help='Suffix to default tag')

    parser.add_argument('--attention',
                        action='store_true',
                        help='Enable Tensorboard log of attention masks')
    parser.add_argument('--log_dir',
                        type=str,
                        help='Directory for saving the training logs',
                        default='runs/binclass/')
    parser.add_argument('--models_dir',
                        type=str,
                        help='Directory for saving the models weights',
                        default='weights/binclass/')

    args = parser.parse_args()

    # Parse arguments
    net_class = getattr(fornet, args.net)
    train_datasets = args.traindb
    val_datasets = args.valdb
    face_policy = args.face
    face_size = args.size

    batch_size = args.batch
    initial_lr = args.lr
    validation_interval = args.valint
    patience = args.patience
    max_num_iterations = args.maxiter
    initial_model = args.init
    train_from_scratch = args.scratch

    max_train_samples = args.trainsamples
    max_val_samples = args.valsamples

    log_interval = args.logint
    num_workers = args.workers
    device = torch.device('cuda:{:d}'.format(
        args.device)) if torch.cuda.is_available() else torch.device('cpu')
    seed = args.seed

    debug = args.debug
    suffix = args.suffix

    enable_attention = args.attention

    weights_folder = args.models_dir
    logs_folder = args.log_dir

    # Random initialization
    np.random.seed(seed)
    torch.random.manual_seed(seed)

    # Load net
    net: nn.Module = net_class().to(device)

    # Loss and optimizers
    criterion = nn.BCEWithLogitsLoss()

    # Training stops once the scheduler has decayed the LR down to this floor.
    min_lr = initial_lr * 1e-5
    optimizer = optim.Adam(net.get_trainable_parameters(), lr=initial_lr)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer=optimizer,
        mode='min',
        factor=0.1,
        patience=patience,
        cooldown=2 * patience,
        min_lr=min_lr,
    )

    tag = utils.make_train_tag(
        net_class=net_class,
        traindb=train_datasets,
        face_policy=face_policy,
        patch_size=face_size,
        seed=seed,
        suffix=suffix,
        debug=debug,
    )

    # Model checkpoint paths
    bestval_path = os.path.join(weights_folder, tag, 'bestval.pth')
    last_path = os.path.join(weights_folder, tag, 'last.pth')
    periodic_path = os.path.join(weights_folder, tag, 'it{:06d}.pth')

    os.makedirs(os.path.join(weights_folder, tag), exist_ok=True)

    # Load model
    val_loss = min_val_loss = 10
    epoch = iteration = 0
    net_state = None
    opt_state = None
    if initial_model is not None:
        # If given load initial model
        print('Loading model form: {}'.format(initial_model))
        state = torch.load(initial_model, map_location='cpu')
        net_state = state['net']
    elif not train_from_scratch and os.path.exists(last_path):
        # Resume: restore network, optimizer and counters from last.pth.
        print('Loading model form: {}'.format(last_path))
        state = torch.load(last_path, map_location='cpu')
        net_state = state['net']
        opt_state = state['opt']
        iteration = state['iteration'] + 1
        epoch = state['epoch']
    if not train_from_scratch and os.path.exists(bestval_path):
        state = torch.load(bestval_path, map_location='cpu')
        min_val_loss = state['val_loss']
    if net_state is not None:
        incomp_keys = net.load_state_dict(net_state, strict=False)
        print(incomp_keys)
    if opt_state is not None:
        # The command-line learning rate overrides the checkpointed one.
        for param_group in opt_state['param_groups']:
            param_group['lr'] = initial_lr
        optimizer.load_state_dict(opt_state)

    # Initialize Tensorboard
    logdir = os.path.join(logs_folder, tag)
    if iteration == 0:
        # If training from scratch or initialization remove history if exists
        shutil.rmtree(logdir, ignore_errors=True)

    # TensorboardX instance
    tb = SummaryWriter(logdir=logdir)
    # The try/finally guarantees the writer is flushed and closed on every
    # exit path: normal completion, the early "no samples" returns, and
    # errors such as a NaN loss.
    try:
        if iteration == 0:
            # Already created on the target device; no extra transfer needed.
            dummy = torch.randn((1, 3, face_size, face_size), device=device)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                tb.add_graph(net, [
                    dummy,
                ], verbose=False)

        transformer = utils.get_transformer(
            face_policy=face_policy,
            patch_size=face_size,
            net_normalizer=net.get_normalizer(),
            train=True)

        # Datasets and data loaders
        print('Loading data')
        splits = split.make_splits(dbs={
            'train': train_datasets,
            'val': val_datasets
        })
        train_dfs = [splits['train'][db][0] for db in splits['train']]
        train_roots = [splits['train'][db][1] for db in splits['train']]
        val_roots = [splits['val'][db][1] for db in splits['val']]
        val_dfs = [splits['val'][db][0] for db in splits['val']]

        train_dataset = FrameFaceIterableDataset(
            roots=train_roots,
            dfs=train_dfs,
            scale=face_policy,
            num_samples=max_train_samples,
            transformer=transformer,
            size=face_size,
        )

        val_dataset = FrameFaceIterableDataset(
            roots=val_roots,
            dfs=val_dfs,
            scale=face_policy,
            num_samples=max_val_samples,
            transformer=transformer,
            size=face_size,
        )

        train_loader = DataLoader(
            train_dataset,
            num_workers=num_workers,
            batch_size=batch_size,
        )

        val_loader = DataLoader(
            val_dataset,
            num_workers=num_workers,
            batch_size=batch_size,
        )

        print('Training samples: {}'.format(len(train_dataset)))
        print('Validation samples: {}'.format(len(val_dataset)))

        if len(train_dataset) == 0:
            print('No training samples. Halt.')
            return

        if len(val_dataset) == 0:
            print('No validation samples. Halt.')
            return

        stop = False
        while not stop:

            # Training
            optimizer.zero_grad()

            # Running sums since the last log; labels/predictions accumulate
            # until the next validation.
            train_loss = train_num = 0
            train_pred_list = []
            train_labels_list = []
            for train_batch in tqdm(train_loader,
                                    desc='Epoch {:03d}'.format(epoch),
                                    leave=False,
                                    total=len(train_loader) //
                                    train_loader.batch_size):
                net.train()
                batch_data, batch_labels = train_batch

                train_batch_num = len(batch_labels)
                train_num += train_batch_num
                train_labels_list.append(batch_labels.numpy().flatten())

                train_batch_loss, train_batch_pred = batch_forward(
                    net, device, criterion, batch_data, batch_labels)
                train_pred_list.append(train_batch_pred.flatten())

                if torch.isnan(train_batch_loss):
                    raise ValueError('NaN loss')

                train_loss += train_batch_loss.item() * train_batch_num

                # Optimization
                train_batch_loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                # Logging
                if iteration > 0 and (iteration % log_interval == 0):
                    train_loss /= train_num
                    tb.add_scalar('train/loss', train_loss, iteration)
                    tb.add_scalar('lr', optimizer.param_groups[0]['lr'],
                                  iteration)
                    tb.add_scalar('epoch', epoch, iteration)

                    # Checkpoint
                    save_model(net, optimizer, train_loss, val_loss,
                               iteration, batch_size, epoch, last_path)
                    train_loss = train_num = 0

                # Validation
                if iteration > 0 and (iteration % validation_interval == 0):

                    # Model checkpoint
                    save_model(net, optimizer, train_loss, val_loss,
                               iteration, batch_size, epoch,
                               periodic_path.format(iteration))

                    # Train cumulative stats
                    train_labels = np.concatenate(train_labels_list)
                    train_pred = np.concatenate(train_pred_list)
                    train_labels_list = []
                    train_pred_list = []

                    train_roc_auc = roc_auc_score(train_labels, train_pred)
                    tb.add_scalar('train/roc_auc', train_roc_auc, iteration)
                    tb.add_pr_curve('train/pr', train_labels, train_pred,
                                    iteration)

                    # Validation
                    val_loss = validation_routine(net, device, val_loader,
                                                  criterion, tb, iteration,
                                                  'val')
                    tb.flush()

                    # LR Scheduler
                    lr_scheduler.step(val_loss)

                    # Model checkpoint
                    if val_loss < min_val_loss:
                        min_val_loss = val_loss
                        save_model(net, optimizer, train_loss, val_loss,
                                   iteration, batch_size, epoch, bestval_path)

                    # Attention
                    if enable_attention and hasattr(net, 'get_attention'):
                        net.eval()
                        # For each dataframe show the attention for a real,fake couple of frames
                        # The loop variable is named att_tag so it does not
                        # clobber the run tag computed above.
                        for df, root, sample_idx, att_tag in [
                            (train_dfs[0], train_roots[0],
                             train_dfs[0][train_dfs[0]['label'] ==
                                          False].index[0], 'train/att/real'),
                            (train_dfs[0], train_roots[0],
                             train_dfs[0][train_dfs[0]['label'] ==
                                          True].index[0], 'train/att/fake'),
                        ]:
                            record = df.loc[sample_idx]
                            tb_attention(tb, att_tag, iteration, net, device,
                                         face_size, face_policy, transformer,
                                         root, record)

                    if optimizer.param_groups[0]['lr'] == min_lr:
                        print('Reached minimum learning rate. Stopping.')
                        stop = True
                        break

                iteration += 1

                if iteration > max_num_iterations:
                    print('Maximum number of iterations reached')
                    stop = True
                    break

                # End of iteration

            epoch += 1
    finally:
        # Needed to flush out last events
        tb.close()

    print('Completed')
columns=['false', 'true']) sns.heatmap(cm_df, annot=True, fmt="d") plt.title('Accuracy:{0:.3f}'.format(acc)) plt.ylabel('True label') plt.xlabel('Predicted label') print(cm) train_writer.add_scalar("pr_auc", pr_auc, (epoch * num_steps) + step) train_writer.add_scalar("roc_auc", roc_auc, (epoch * num_steps) + step) train_writer.add_figure("roc_curve", fig, (epoch * num_steps) + step) train_writer.add_figure("cm", fig_cm, (epoch * num_steps) + step) train_writer.add_pr_curve("pr_curve", labels[start_ind:end_ind], predict_train, (epoch * num_steps) + step) if ((epoch * num_steps) + step) % dev_print_gap == dev_print_gap - 1: l2s_test, labels_test = generate_data( dataset=dataset, data_index=dev_index, flow_size=flow_size, negetive_samples=negetive_samples_test) test_time_before = time.time() tp = 0 fp = 0 loss_sum = 0 num_steps_test = (len(l2s_test) // batch_size) - 1 Y_est = np.zeros((batch_size * (num_steps_test + 1), 1),