def evaluate(model, criterion, data_loader, device):
    model.eval()
    metric_logger = MetricLogger(delimiter=" ")
    header = 'Test:'
    with torch.no_grad():
        for video, target in metric_logger.log_every(data_loader, 100, header):
            start_time = time.time()
            video = video.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)
            output = model(video)
            time_diff = time.time() - start_time
            print("Predicting on a video of shape {} took {} seconds".format(video.shape, time_diff))
            print("target shape {}".format(target.shape))
            print("target {}".format(target))
            loss = criterion(output, target)

            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            # FIXME need to take into account that the datasets
            # could have been padded in distributed setup
            batch_size = video.shape[0]
            metric_logger.update(loss=loss.item())
            metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
            metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)

    print(' * Clip Acc@1 {top1.global_avg:.3f} Clip Acc@5 {top5.global_avg:.3f}'
          .format(top1=metric_logger.acc1, top5=metric_logger.acc5))
    return metric_logger.acc1.global_avg
def evaluate(model, epoch, criterion, data_loader, device, writer):
    model.eval()
    metric_logger = MetricLogger(delimiter=" ")
    header = 'Test:'
    cntr = 0
    running_accuracy = 0.0
    with torch.no_grad():
        for video, target in metric_logger.log_every(data_loader, 100, header):
            video = video.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)
            output = model(video)
            loss = criterion(output, target)

            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            # FIXME need to take into account that the datasets
            # could have been padded in distributed setup
            batch_size = video.shape[0]
            running_accuracy += acc1.item()
            if cntr % 10 == 9:
                # log the accuracy averaged over the last 10 mini-batches
                writer.add_scalar('validation accuracy', running_accuracy / 10,
                                  epoch * len(data_loader) + cntr)
                running_accuracy = 0.0
            cntr += 1

            metric_logger.update(loss=loss.item())
            metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
            metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)

    print(' * Clip Acc@1 {top1.global_avg:.3f} Clip Acc@5 {top5.global_avg:.3f}'
          .format(top1=metric_logger.acc1, top5=metric_logger.acc5))
    return metric_logger.acc1.global_avg
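These evaluation loops (and several training loops below) rely on an accuracy(output, target, topk=...) helper that is not included in this collection. A minimal sketch, assuming the torchvision-style reference utility; the project's own helper may differ:

import torch

def accuracy(output, target, topk=(1,)):
    """Compute top-k accuracy (in percent) for the given logits and labels."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        # Indices of the maxk highest-scoring classes per sample, shape (maxk, batch).
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res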
def train_one_epoch(model, optimizer, lr_scheduler, data_loader, epoch,
                    print_freq, checkpoint_fn=None):
    model.train()
    metric_logger = MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value}'))
    metric_logger.add_meter('batch/s', SmoothedValue(window_size=10, fmt='{value:.3f}'))
    header = 'Epoch: [{}]'.format(epoch)

    for step, batched_inputs in enumerate(
            metric_logger.log_every(data_loader, print_freq, header)):
        start_time = time.time()
        loss = model(batched_inputs)

        # Occasionally checkpoint mid-epoch (a 0.5% chance per step).
        if checkpoint_fn is not None and np.random.random() < 0.005:
            checkpoint_fn()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"])
        # Note: this meter records elapsed seconds per batch.
        metric_logger.meters['batch/s'].update(time.time() - start_time)
        lr_scheduler.step()

    if checkpoint_fn is not None:
        checkpoint_fn()
def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch):
    model.train()
    sums = defaultdict(lambda: 0.0)
    start1 = time()

    metric = MetricLogger("train_iteration")
    metric["epoch"] = epoch

    for waveform, specgram, target in bg_iterator(data_loader, maxsize=2):
        start2 = time()
        waveform = waveform.to(device)
        specgram = specgram.to(device)
        target = target.to(device)

        output = model(waveform, specgram)
        output, target = output.squeeze(1), target.squeeze(1)

        loss = criterion(output, target)
        loss_item = loss.item()
        sums["loss"] += loss_item
        metric["loss"] = loss_item

        optimizer.zero_grad()
        loss.backward()

        if args.clip_grad > 0:
            gradient = torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
            sums["gradient"] += gradient.item()
            metric["gradient"] = gradient.item()

        optimizer.step()

        metric["iteration"] = sums["iteration"]
        metric["time"] = time() - start2
        metric()
        sums["iteration"] += 1

    avg_loss = sums["loss"] / len(data_loader)

    metric = MetricLogger("train_epoch")
    metric["epoch"] = epoch
    metric["loss"] = avg_loss
    metric["gradient"] = sums["gradient"] / len(data_loader)  # average clipped-gradient norm
    metric["time"] = time() - start1
    metric()
def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch):
    model.train()
    sums = defaultdict(lambda: 0.0)
    start1 = time()

    metric = MetricLogger("train_iteration")
    metric["epoch"] = epoch

    for i, batch in enumerate(data_loader):
        start2 = time()
        adjust_learning_rate(epoch, optimizer, args.learning_rate,
                             args.anneal_steps, args.anneal_factor)

        model.zero_grad()
        x, y, _ = batch_to_gpu(batch)
        y_pred = model(x)

        loss = criterion(y_pred, y)
        loss_item = loss.item()
        sums["loss"] += loss_item
        metric["loss"] = loss_item

        optimizer.zero_grad()
        loss.backward()

        if args.clip_grad > 0:
            gradient = torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
            sums["gradient"] += gradient.item()
            metric["gradient"] = gradient.item()

        optimizer.step()

        metric["iteration"] = sums["iteration"]
        metric["time"] = time() - start2
        metric()
        sums["iteration"] += 1

    avg_loss = sums["loss"] / len(data_loader)

    metric = MetricLogger("train_epoch")
    metric["epoch"] = epoch
    metric["loss"] = avg_loss
    metric["gradient"] = sums["gradient"] / len(data_loader)  # average clipped-gradient norm
    metric["time"] = time() - start1
    metric()
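This loop calls adjust_learning_rate(...) and batch_to_gpu(...), neither of which appears in this collection. Below is a minimal, purely illustrative sketch of a step-decay adjust_learning_rate; the real helper and its annealing schedule may differ:

def adjust_learning_rate(epoch, optimizer, learning_rate, anneal_steps, anneal_factor):
    # Hypothetical step decay: shrink the base LR by anneal_factor once for
    # every anneal step that the current epoch has already passed.
    passed = sum(1 for step in (anneal_steps or []) if epoch >= int(step))
    lr = learning_rate * (anneal_factor ** passed)
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr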
def validate(model, criterion, data_loader, device, epoch):
    with torch.no_grad():
        model.eval()
        sums = defaultdict(lambda: 0.0)
        start = time()

        for i, batch in enumerate(data_loader):
            x, y, _ = batch_to_gpu(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            sums["loss"] += loss.item()

        avg_loss = sums["loss"] / len(data_loader)

        metric = MetricLogger("validation")
        metric["epoch"] = epoch
        metric["loss"] = avg_loss
        metric["time"] = time() - start
        metric()

        return avg_loss
def validate(model, criterion, data_loader, device, epoch):
    with torch.no_grad():
        model.eval()
        sums = defaultdict(lambda: 0.0)
        start = time()

        for waveform, specgram, target in bg_iterator(data_loader, maxsize=2):
            waveform = waveform.to(device)
            specgram = specgram.to(device)
            target = target.to(device)

            output = model(waveform, specgram)
            output, target = output.squeeze(1), target.squeeze(1)

            loss = criterion(output, target)
            sums["loss"] += loss.item()

        avg_loss = sums["loss"] / len(data_loader)

        metric = MetricLogger("validation")
        metric["epoch"] = epoch
        metric["loss"] = avg_loss
        metric["time"] = time() - start
        metric()

        return avg_loss
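The torchaudio-style loops above assign metrics by key (metric["loss"] = ..., metric["cumulative loss"] += ...) and call metric() to emit a record. A minimal sketch of that assumed interface, as an assumption about the missing helper rather than its actual implementation:

import json
from collections import defaultdict

class MetricLogger(defaultdict):
    """Dict-like logger: assign or accumulate metrics by key, then call the
    instance to emit one log record (here, a JSON line)."""

    def __init__(self, group, disable=False):
        super().__init__(lambda: 0.0)
        self["group"] = group
        self.disable = disable

    def __call__(self):
        if not self.disable:
            print(json.dumps(self, default=str), flush=True)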
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        # Linear warm-up over (at most) the first 1000 iterations.
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
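The warm-up branch above relies on a warmup_lr_scheduler helper. The sketch below mirrors the torchvision detection reference, which returns a LambdaLR that ramps the learning-rate multiplier linearly from warmup_factor to 1 over warmup_iters steps; the local helper may differ:

import torch

def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
    def f(x):
        # x is the number of scheduler steps taken so far.
        if x >= warmup_iters:
            return 1
        alpha = float(x) / warmup_iters
        # Interpolate the LR multiplier linearly from warmup_factor to 1.
        return warmup_factor * (1 - alpha) + alpha

    return torch.optim.lr_scheduler.LambdaLR(optimizer, f)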
def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader,
                    device, epoch, print_freq, writer):
    model.train()
    metric_logger = MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value}'))
    metric_logger.add_meter('clips/s', SmoothedValue(window_size=10, fmt='{value:.3f}'))

    running_loss = 0.0
    running_accuracy = 0.0
    header = 'Epoch: [{}]'.format(epoch)
    cntr = 0
    for video, target in metric_logger.log_every(data_loader, print_freq, header):
        start_time = time.time()
        video, target = video.to(device), target.to(device)
        output = model(video)
        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        batch_size = video.shape[0]
        running_loss += loss.item()
        running_accuracy += acc1.item()
        if cntr % 10 == 9:
            # log the loss and accuracy averaged over the last 10 mini-batches
            writer.add_scalar('training loss', running_loss / 10,
                              epoch * len(data_loader) + cntr)
            writer.add_scalar('learning rate', optimizer.param_groups[0]["lr"],
                              epoch * len(data_loader) + cntr)
            writer.add_scalar('accuracy', running_accuracy / 10,
                              epoch * len(data_loader) + cntr)
            running_loss = 0.0
            running_accuracy = 0.0
        cntr = cntr + 1

        metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"])
        metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
        metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)
        metric_logger.meters['clips/s'].update(batch_size / (time.time() - start_time))
        lr_scheduler.step()
def evaluate(
    model,
    criterion,
    data_loader,
    decoder,
    language_model,
    device,
    epoch,
    disable_logger=False,
):
    with torch.no_grad():
        model.eval()
        start = time()
        metric = MetricLogger("validation", disable=disable_logger)
        metric["epoch"] = epoch

        for inputs, targets, tensors_lengths, target_lengths in bg_iterator(data_loader, maxsize=2):
            inputs = inputs.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)

            # keep batch first for data parallel
            outputs = model(inputs).transpose(-1, -2).transpose(0, 1)

            # CTC
            # outputs: input length, batch size, number of classes (including blank)
            # targets: batch size, max target length
            # input_lengths: batch size
            # target_lengths: batch size
            metric["cumulative loss"] += criterion(outputs, targets,
                                                   tensors_lengths, target_lengths).item()
            metric["dataset length"] += len(inputs)
            metric["iteration"] += 1

            compute_error_rates(outputs, targets, decoder, language_model, metric)

        metric["average loss"] = metric["cumulative loss"] / metric["iteration"]
        metric["validation time"] = time() - start
        metric()

        return metric["average loss"]
def train_eval_mse(self, model, valset, writer, global_step, device):
    """
    Evaluate MSE during training
    """
    num_samples = eval_cfg.train.num_samples.mse
    batch_size = eval_cfg.train.batch_size
    num_workers = eval_cfg.train.num_workers
    model.eval()

    if valset is None:
        data_set = np.load('../data/TABLE/val/all_set_val.npy')
        data_size = len(data_set)
        # Create shuffled indices so the data can be accessed in random order,
        # keep num_samples of them, and split them into mini-batches.
        idx_set = np.arange(data_size)
        np.random.shuffle(idx_set)
        idx_set = idx_set[:num_samples]
        idx_set = np.split(idx_set, len(idx_set) // batch_size)
        data_to_enumerate = idx_set
    else:
        valset = Subset(valset, indices=range(num_samples))
        dataloader = DataLoader(valset, batch_size=batch_size,
                                num_workers=num_workers, shuffle=False)
        data_to_enumerate = dataloader

    metric_logger = MetricLogger()
    print(f'Evaluating MSE using {num_samples} samples.')
    with tqdm(total=num_samples) as pbar:
        for batch_idx, sample in enumerate(data_to_enumerate):
            if valset is None:
                data_i = data_set[sample]
                data_i = torch.from_numpy(data_i).float().to(device)
                data_i /= 255
                data_i = data_i.permute([0, 3, 1, 2])
                imgs = data_i
            else:
                imgs = sample.to(device)

            loss, log = model(imgs, global_step)
            B = imgs.size(0)
            for b in range(B):
                metric_logger.update(mse=log['mse'][b])
            metric_logger.update(loss=loss.mean())
            pbar.update(B)

    assert metric_logger['mse'].count == num_samples

    mse = metric_logger['mse'].global_avg
    writer.add_scalar('val/mse', mse, global_step=global_step)
    model.train()
    return mse
def evaluate(model, data_loader, device):
    n_threads = torch.get_num_threads()
    # FIXME remove this and make paste_masks_in_image run on the GPU
    torch.set_num_threads(1)
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = MetricLogger(delimiter=" ")
    header = 'Test:'

    coco = get_coco_api_from_dataset(data_loader.dataset)
    iou_types = _get_iou_types(model)
    coco_evaluator = CocoEvaluator(coco, iou_types)

    for images, targets in metric_logger.log_every(data_loader, 100, header):
        images = list(img.to(device) for img in images)

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        model_time = time.time()
        outputs = model(images)
        outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
        model_time = time.time() - model_time

        res = {target["image_id"].item(): output
               for target, output in zip(targets, outputs)}
        evaluator_time = time.time()
        coco_evaluator.update(res)
        evaluator_time = time.time() - evaluator_time
        metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    torch.set_num_threads(n_threads)
    return coco_evaluator
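The evaluation above calls _get_iou_types to decide which COCO metrics to compute. A sketch that mirrors the torchvision detection reference helper; the local version may differ:

import torch
import torchvision

def _get_iou_types(model):
    # Unwrap DistributedDataParallel to inspect the underlying detector.
    model_without_ddp = model
    if isinstance(model, torch.nn.parallel.DistributedDataParallel):
        model_without_ddp = model.module
    iou_types = ["bbox"]
    if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN):
        iou_types.append("segm")
    if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN):
        iou_types.append("keypoints")
    return iou_types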
def train_eval_mse(self, model, valset, writer, global_step, device):
    """
    Evaluate MSE during training
    """
    num_samples = eval_cfg.train.num_samples.mse
    batch_size = eval_cfg.train.batch_size
    num_workers = eval_cfg.train.num_workers
    model.eval()

    dataloader = DataLoader(valset, batch_size=batch_size, shuffle=True,
                            num_workers=0, drop_last=True,
                            collate_fn=valset.collate_fn)

    metric_logger = MetricLogger()
    print(f'Evaluating MSE using {num_samples} samples.')
    n_batch = 0
    with tqdm(total=num_samples) as pbar:
        for batch_idx, sample in enumerate(dataloader):
            imgs = sample[0].to(device)
            loss, log = model(imgs, global_step)
            B = imgs.size(0)
            for b in range(B):
                metric_logger.update(mse=log['mse'][b])
            metric_logger.update(loss=loss.mean())
            pbar.update(1)
            n_batch += 1
            # Stop once num_samples batches have been evaluated.
            if n_batch >= num_samples:
                break

    mse = metric_logger['mse'].global_avg
    writer.add_scalar('val/mse', mse, global_step=global_step)
    model.train()
    return mse
def train_one_epoch(train_loader, model, criterion, optimizer, writer, epoch,
                    total_step, config):
    log_header = 'EPOCH {}'.format(epoch)
    losses = AverageMeter('Loss', fmt=':.4f')
    if config.method != 'byol':
        top1 = AverageMeter('Acc1', fmt=':4.2f')
        top5 = AverageMeter('Acc5', fmt=':4.2f')
    lr = AverageMeter('Lr', fmt=":.6f")

    metric_logger = MetricLogger(delimeter=" | ")
    metric_logger.add_meter(losses)
    if config.method != 'byol':
        metric_logger.add_meter(top1)
        metric_logger.add_meter(top5)
    metric_logger.add_meter(lr)

    for step, (images, _) in enumerate(
            metric_logger.log_every(train_loader, config.system.print_freq, log_header)):
        total_step.val += 1
        if config.system.gpu is not None:
            # [pos, neg]
            images[0] = images[0].cuda(config.system.gpu, non_blocking=True)
            images[1] = images[1].cuda(config.system.gpu, non_blocking=True)

        if config.method != 'byol':
            logits, targets, logits_original = model(view_1=images[0], view_2=images[1])
            loss = criterion(logits, targets)
            acc1, acc5 = accuracy(logits_original, targets, topk=(1, 5))
        else:
            loss_pre = model(view_1=images[0], view_2=images[1])
            loss = loss_pre.mean()

        lr_ = optimizer.param_groups[0]['lr']
        if config.method != 'byol':
            metric_logger.update(Loss=loss.detach().cpu().item(),
                                 Acc1=acc1.detach().cpu().item(),
                                 Acc5=acc5.detach().cpu().item(),
                                 Lr=lr_)
        else:
            metric_logger.update(Loss=loss.detach().cpu().item(), Lr=lr_)

        writer.add_scalar('loss', loss.detach().cpu().item(), total_step.val)
        if config.method != 'byol':
            writer.add_scalar('top1', acc1.detach().cpu().item(), total_step.val)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
def train(proc_id, n_gpus, args, devices, movielens): # Start up distributed training, if enabled. dev_id = devices[proc_id] if n_gpus > 1: dist_init_method = 'tcp://{master_ip}:{master_port}'.format( master_ip='127.0.0.1', master_port='12345') world_size = n_gpus th.distributed.init_process_group(backend="nccl", init_method=dist_init_method, world_size=world_size, rank=proc_id) th.cuda.set_device(dev_id) # set random seed in each gpu th.manual_seed(args.seed) if th.cuda.is_available(): th.cuda.manual_seed_all(args.seed) # Split train_dataset and set dataloader train_rating_pairs = th.split(th.stack(movielens.train_rating_pairs), len(movielens.train_rating_values) // args.n_gpus, dim=1)[proc_id] train_rating_values = th.split(movielens.train_rating_values, len(movielens.train_rating_values) // args.n_gpus, dim=0)[proc_id] train_dataset = MovieLensDataset(train_rating_pairs, train_rating_values, movielens.train_graph, args.hop, args.sample_ratio, args.max_nodes_per_hop) train_loader = th.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_movielens) if proc_id == 0: if args.testing: test_dataset = MovieLensDataset(movielens.test_rating_pairs, movielens.test_rating_values, movielens.train_graph, args.hop, args.sample_ratio, args.max_nodes_per_hop) else: test_dataset = MovieLensDataset(movielens.valid_rating_pairs, movielens.valid_rating_pairs, movielens.train_graph, args.hop, args.sample_ratio, args.max_nodes_per_hop) test_loader = th.utils.data.DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_movielens) model = IGMC( in_feats=(args.hop + 1) * 2, latent_dim=[32, 32, 32, 32], num_relations=5, #dataset_base.num_rating, num_bases=4, regression=True, edge_dropout=args.edge_dropout, # side_features=args.use_features, # n_side_features=n_features, # multiply_by=args.multiply_by ).to(dev_id) if n_gpus > 1: model = DistributedDataParallel(model, device_ids=[dev_id], output_device=dev_id) loss_fn = nn.MSELoss().to(dev_id) optimizer = optim.Adam(model.parameters(), lr=args.train_lr, weight_decay=0) if proc_id == 0: print("Loading network finished ...\n") # prepare the logger logger = MetricLogger(args.save_dir, args.valid_log_interval) best_epoch = 0 best_rmse = np.inf print("Start training ...") for epoch_idx in range(1, args.train_epochs + 1): if proc_id == 0: print('Epoch', epoch_idx) train_loss = train_epoch(proc_id, n_gpus, model, loss_fn, optimizer, args.arr_lambda, train_loader, dev_id, args.train_log_interval) if n_gpus > 1: th.distributed.barrier() if proc_id == 0: test_rmse = evaluate(model, test_loader, dev_id) eval_info = { 'epoch': epoch_idx, 'train_loss': train_loss, 'test_rmse': test_rmse, } print( '=== Epoch {}, train loss {:.6f}, test rmse {:.6f} ==='.format( *eval_info.values())) if epoch_idx % args.train_lr_decay_step == 0: for param in optimizer.param_groups: param['lr'] = args.train_lr_decay_factor * param['lr'] logger.log(eval_info, model, optimizer) if best_rmse > test_rmse: best_rmse = test_rmse best_epoch = epoch_idx if n_gpus > 1: th.distributed.barrier() if proc_id == 0: eval_info = "Training ends. The best testing rmse is {:.6f} at epoch {}".format( best_rmse, best_epoch) print(eval_info) with open(os.path.join(args.save_dir, 'log.txt'), 'a') as f: f.write(eval_info)
def train(args): print(args) dataset = DataSetLoader(args.data_name, args.device, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm, test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio, sample_rate=args.sample_rate) print("Loading data finished ...\n") args.src_in_units = dataset.user_feature_shape[1] args.dst_in_units = dataset.movie_feature_shape[1] args.rating_vals = dataset.possible_rating_values ### build the net #args.decoder = "MLP" net = Net(args=args) #print(args) net = net.to(args.device) nd_possible_rating_values = th.FloatTensor( dataset.possible_rating_values).to(args.device) rating_loss_net = nn.CrossEntropyLoss() learning_rate = args.train_lr optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate) print("Loading network finished ...\n") ### perpare training data train_gt_labels = dataset.train_labels train_gt_ratings = dataset.train_truths ### prepare the logger train_loss_logger = MetricLogger( ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'], os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id)) valid_loss_logger = MetricLogger( ['iter', 'rmse', "ndcg_20", "ndcg_40", "ndcg_80"], ['%d', '%.4f', '%.4f', '%.4f', '%.4f'], os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id)) test_loss_logger = MetricLogger( ['iter', 'rmse', "ndcg_20", "ndcg_40", "ndcg_80"], ['%d', '%.4f', '%.4f', '%.4f', '%.4f'], os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id)) ### declare the loss information best_valid_rmse = np.inf best_valid_ndcg = -np.inf best_test_ndcg = -np.inf no_better_valid = 0 best_iter = -1 count_rmse = 0 count_num = 0 count_loss = 0 dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device) dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device) dataset.valid_enc_graph = dataset.train_enc_graph dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device) dataset.valid_recall_dec_graph = dataset.valid_recall_dec_graph.to( args.device) dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device) dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device) dataset.test_recall_dec_graph = dataset.test_recall_dec_graph.to( args.device) print("Start training ...") dur = [] for iter_idx in range(1, args.train_max_iter): ''' noisy_labels = th.LongTensor(np.random.choice([-1, 0, 1], train_gt_ratings.shape[0], replace=True, p=[0.001, 0.998, 0.001])).to(args.device) train_gt_labels += noisy_labels max_label = dataset.max_l + th.zeros_like(train_gt_labels) min_label = dataset.min_l + th.zeros_like(train_gt_labels) max_label = max_label.long() min_label = min_label.long() train_gt_labels = th.where(train_gt_labels > max_label, max_label, train_gt_labels) train_gt_labels = th.where(train_gt_labels < min_label, min_label, train_gt_labels) ''' if iter_idx > 3: t0 = time.time() net.train() if iter_idx > 250: Two_Stage = True else: Two_Stage = False Two_Stage = False pred_ratings, reg_loss, user_out, movie_out, W = net( dataset.train_enc_graph, dataset.train_dec_graph, dataset.user_feature, dataset.movie_feature, Two_Stage) #print("user_out:\n", user_out[0]) #print("movie_out:\n", movie_out[0]) #print("W:\n", W.shape) if args.loss_func == "CE": loss = rating_loss_net( pred_ratings, train_gt_labels).mean() + args.ARR * reg_loss ''' real_pred_ratings = (th.softmax(pred_ratings, dim=1) * nd_possible_rating_values.view(1, -1)).sum(dim=1) mse_loss = th.sum((real_pred_ratings - train_gt_ratings) ** 2) loss += mse_loss * 0.0001 ''' elif args.loss_func == 
"Hinge": real_pred_ratings = (th.softmax(pred_ratings, dim=1) * nd_possible_rating_values.view(1, -1)).sum( dim=1) gap = (real_pred_ratings - train_gt_labels)**2 hinge_loss = th.where(gap > 1.0, gap * gap, gap).mean() loss = hinge_loss elif args.loss_func == "MSE": ''' seeds = th.arange(pred_ratings.shape[0]) random.shuffle(seeds) for i in range((pred_ratings.shape[0] - 1) // 50 + 1): start = i * 50 end = (i + 1) * 50 if end > (pred_ratings.shape[0] - 1): end = pred_ratings.shape[0] - 1 batch = seeds[start:end] loss = F.mse_loss(pred_ratings[batch, 0], nd_possible_rating_values[train_gt_labels[batch]]) + args.ARR * reg_loss count_loss += loss.item() * 50 / pred_ratings.shape[0] optimizer.zero_grad() loss.backward(retain_graph=True) #nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip) optimizer.step() pred_ratings, reg_loss = net(dataset.train_enc_graph, dataset.train_dec_graph, dataset.user_feature, dataset.movie_feature) ''' loss = th.mean((pred_ratings[:, 0] - nd_possible_rating_values[train_gt_labels])** 2) + args.ARR * reg_loss count_loss += loss.item() optimizer.zero_grad() loss.backward(retain_graph=True) nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip) optimizer.step() if iter_idx > 3: dur.append(time.time() - t0) if iter_idx == 1: print("Total #Param of net: %d" % (torch_total_param_num(net))) print( torch_net_info(net, save_path=os.path.join( args.save_dir, 'net%d.txt' % args.save_id))) if args.loss_func == "CE": real_pred_ratings = (th.softmax(pred_ratings, dim=1) * nd_possible_rating_values.view(1, -1)).sum( dim=1) elif args.loss_func == "MSE": real_pred_ratings = pred_ratings[:, 0] rmse = ((real_pred_ratings - train_gt_ratings)**2).sum() count_rmse += rmse.item() count_num += pred_ratings.shape[0] if iter_idx % args.train_log_interval == 0: train_loss_logger.log(iter=iter_idx, loss=count_loss / (iter_idx + 1), rmse=count_rmse / count_num) logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format( iter_idx, count_loss / iter_idx, count_rmse / count_num, np.average(dur)) count_rmse = 0 count_num = 0 if iter_idx % args.train_valid_interval == 0: valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid') ndcg_valid = evaluate_metric(args=args, net=net, dataset=dataset, segment='valid', debug=False) print("ndcg_valid:", ndcg_valid) valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse, ndcg_20=ndcg_valid[0], ndcg_40=ndcg_valid[1], ndcg_80=ndcg_valid[2]) print("-" * 80) #test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test') #test_loss_logger.log(iter=iter_idx, rmse=test_rmse, ndcg_20 = ndcg_k[0], ndcg_40 = ndcg_k[1], ndcg_80 = ndcg_k[2]) #logging_str += ', Test RMSE={:.4f}'.format(test_rmse) logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse) logging_str += ',\tndcg_valid_20={:.4f}'.format(ndcg_valid[0]) logging_str += ',\tndcg_valid_40={:.4f}'.format(ndcg_valid[1]) logging_str += ',\tndcg_valid_80={:.4f}'.format(ndcg_valid[2]) ndcg_valid_20 = ndcg_valid[0] #print("***********",ndcg_valid_20) if ndcg_valid_20 > best_valid_ndcg: best_valid_ndcg = ndcg_valid_20 print("************best_valid_ndcg:", best_valid_ndcg) print("************ndcg_valid_20:", ndcg_valid_20) no_better_valid = 0 best_iter = iter_idx test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test', debug=True, idx=iter_idx) ndcg_test = evaluate_metric(args=args, net=net, dataset=dataset, segment='test', debug=False) logging_str += ',\tbest ndcg_test={:.4f}'.format(ndcg_test[0]) logging_str += ',\tbest 
ndcg_test={:.4f}'.format(ndcg_test[1]) logging_str += ',\tbest ndcg_test={:.4f}'.format(ndcg_test[2]) #best_test_rmse = test_rmse best_test_ndcg = ndcg_test #test_loss_logger.log(iter=iter_idx, rmse=test_rmse) test_loss_logger.log(iter=iter_idx, rmse=test_rmse, ndcg_20=ndcg_test[0], ndcg_40=ndcg_test[1], ndcg_80=ndcg_test[2]) #logging_str += ', Test RMSE={:.4f}'.format(test_rmse) else: no_better_valid += 1 if no_better_valid > args.train_early_stopping_patience\ and learning_rate <= args.train_min_lr: logging.info( "Early stopping threshold reached. Stop training.") break if no_better_valid > args.train_decay_patience: new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr) if new_lr < learning_rate: learning_rate = new_lr logging.info("\tChange the LR to %g" % new_lr) for p in optimizer.param_groups: p['lr'] = learning_rate no_better_valid = 0 #print("************best_valid_ndcg:",best_valid_ndcg) #print("************ndcg_valid_20:",ndcg_valid_20) if iter_idx % args.train_log_interval == 0: print(logging_str) print( 'Best Iter Idx={}, best ndcg_20={:.4f}, best ndcg_40={:.4f}, best ndcg_80={:.4f}' .format(best_iter, best_test_ndcg[0], best_test_ndcg[1], best_test_ndcg[2])) train_loss_logger.close() valid_loss_logger.close() test_loss_logger.close()
def train_one_epoch(
    model,
    criterion,
    optimizer,
    scheduler,
    data_loader,
    decoder,
    language_model,
    device,
    epoch,
    clip_grad,
    disable_logger=False,
    reduce_lr_on_plateau=False,
):
    model.train()
    metric = MetricLogger("train", disable=disable_logger)
    metric["epoch"] = epoch

    for inputs, targets, tensors_lengths, target_lengths in bg_iterator(data_loader, maxsize=2):
        start = time()
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        # keep batch first for data parallel
        outputs = model(inputs).transpose(-1, -2).transpose(0, 1)

        # CTC
        # outputs: input length, batch size, number of classes (including blank)
        # targets: batch size, max target length
        # input_lengths: batch size
        # target_lengths: batch size
        loss = criterion(outputs, targets, tensors_lengths, target_lengths)

        optimizer.zero_grad()
        loss.backward()

        if clip_grad > 0:
            metric["gradient"] = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

        optimizer.step()

        compute_error_rates(outputs, targets, decoder, language_model, metric)

        try:
            metric["lr"] = scheduler.get_last_lr()[0]
        except AttributeError:
            metric["lr"] = optimizer.param_groups[0]["lr"]

        metric["batch size"] = len(inputs)
        metric["n_channel"] = inputs.shape[1]
        metric["n_time"] = inputs.shape[-1]
        metric["dataset length"] += metric["batch size"]
        metric["iteration"] += 1
        metric["loss"] = loss.item()
        metric["cumulative loss"] += metric["loss"]
        metric["average loss"] = metric["cumulative loss"] / metric["iteration"]
        metric["iteration time"] = time() - start
        metric["epoch time"] += metric["iteration time"]
        metric()

    if reduce_lr_on_plateau and isinstance(scheduler, ReduceLROnPlateau):
        scheduler.step(metric["average loss"])
    elif not isinstance(scheduler, ReduceLROnPlateau):
        scheduler.step()
def train(cfg): torch.manual_seed(cfg.seed) np.random.seed(cfg.seed) torch.manual_seed(cfg.seed) torch.cuda.manual_seed(cfg.seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False # Some info print('Experiment name:', cfg.exp_name) print('Model name:', cfg.model) print('Dataset:', cfg.dataset) print('Resume:', cfg.resume) if cfg.resume: print('Checkpoint:', cfg.resume_ckpt if cfg.resume_ckpt else 'last checkpoint') print('Using device:', cfg.device) if 'cuda' in cfg.device: print('Using parallel:', cfg.parallel) if cfg.parallel: print('Device ids:', cfg.device_ids) print('\nLoading data...') trainloader = get_dataloader(cfg, 'train') if cfg.val.ison or cfg.vis.ison: valset = get_dataset(cfg, 'val') valloader = get_dataloader(cfg, 'val') print('Data loaded.') print('Initializing model...') model = get_model(cfg) model = model.to(cfg.device) print('Model initialized.') model.train() optimizer = get_optimizer(cfg, model) # Checkpointer will print information. checkpointer = Checkpointer(os.path.join(cfg.checkpointdir, cfg.exp_name), max_num=cfg.train.max_ckpt) start_epoch = 0 start_iter = 0 global_step = 0 if cfg.resume: checkpoint = checkpointer.load(cfg.resume_ckpt, model, optimizer) if checkpoint: start_epoch = checkpoint['epoch'] global_step = checkpoint['global_step'] + 1 if cfg.parallel: model = nn.DataParallel(model, device_ids=cfg.device_ids) writer = SummaryWriter(log_dir=os.path.join(cfg.logdir, cfg.exp_name), purge_step=global_step, flush_secs=30) metric_logger = MetricLogger() vis_logger = get_vislogger(cfg) evaluator = get_evaluator(cfg) print('Start training') end_flag = False for epoch in range(start_epoch, cfg.train.max_epochs): if end_flag: break start = time.perf_counter() for i, data in enumerate(trainloader): end = time.perf_counter() data_time = end - start start = end imgs, *_ = [d.to(cfg.device) for d in data] model.train() loss, log = model(imgs, global_step) # If you are using DataParallel loss = loss.mean() optimizer.zero_grad() loss.backward() if cfg.train.clip_norm: clip_grad_norm_(model.parameters(), cfg.train.clip_norm) optimizer.step() end = time.perf_counter() batch_time = end - start metric_logger.update(data_time=data_time) metric_logger.update(batch_time=batch_time) metric_logger.update(loss=loss.item()) if (global_step + 1) % cfg.train.print_every == 0: start = time.perf_counter() log.update(loss=metric_logger['loss'].median) vis_logger.model_log_vis(writer, log, global_step + 1) end = time.perf_counter() device_text = cfg.device_ids if cfg.parallel else cfg.device print( 'exp: {}, device: {}, epoch: {}, iter: {}/{}, global_step: {}, loss: {:.2f}, batch time: {:.4f}s, data time: {:.4f}s, log time: {:.4f}s' .format(cfg.exp_name, device_text, epoch + 1, i + 1, len(trainloader), global_step + 1, metric_logger['loss'].median, metric_logger['batch_time'].avg, metric_logger['data_time'].avg, end - start)) if (global_step + 1) % cfg.train.save_every == 0: start = time.perf_counter() checkpointer.save(model, optimizer, epoch, global_step) print('Saving checkpoint takes {:.4f}s.'.format( time.perf_counter() - start)) if (global_step + 1) % cfg.vis.vis_every == 0 and cfg.vis.ison: print('Doing visualization...') start = time.perf_counter() vis_logger.train_vis(model, valset, writer, global_step, cfg.vis.indices, cfg.device, cond_steps=cfg.vis.cond_steps, fg_sample=cfg.vis.fg_sample, bg_sample=cfg.vis.bg_sample, num_gen=cfg.vis.num_gen) print( 'Visualization takes {:.4f}s.'.format(time.perf_counter() - start)) if (global_step + 1) % 
cfg.val.val_every == 0 and cfg.val.ison: print('Doing evaluation...') start = time.perf_counter() evaluator.train_eval( evaluator, os.path.join(cfg.evaldir, cfg.exp_name), cfg.val.metrics, cfg.val.eval_types, cfg.val.intervals, cfg.val.cond_steps, model, valset, valloader, cfg.device, writer, global_step, [model, optimizer, epoch, global_step], checkpointer) print('Evaluation takes {:.4f}s.'.format(time.perf_counter() - start)) start = time.perf_counter() global_step += 1 if global_step >= cfg.train.max_steps: end_flag = True break
def train(args): print(args) dataset = MovieLens( args.data_name, args.ctx, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm, test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio, ) print("Loading data finished ...\n") args.src_in_units = dataset.user_feature_shape[1] args.dst_in_units = dataset.movie_feature_shape[1] args.rating_vals = dataset.possible_rating_values ### build the net net = Net(args=args) net.initialize(init=mx.init.Xavier(factor_type="in"), ctx=args.ctx) net.hybridize() nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values, ctx=args.ctx, dtype=np.float32) rating_loss_net = gluon.loss.SoftmaxCELoss() rating_loss_net.hybridize() trainer = gluon.Trainer(net.collect_params(), args.train_optimizer, {"learning_rate": args.train_lr}) print("Loading network finished ...\n") ### perpare training data train_gt_labels = dataset.train_labels train_gt_ratings = dataset.train_truths ### prepare the logger train_loss_logger = MetricLogger( ["iter", "loss", "rmse"], ["%d", "%.4f", "%.4f"], os.path.join(args.save_dir, "train_loss%d.csv" % args.save_id), ) valid_loss_logger = MetricLogger( ["iter", "rmse"], ["%d", "%.4f"], os.path.join(args.save_dir, "valid_loss%d.csv" % args.save_id), ) test_loss_logger = MetricLogger( ["iter", "rmse"], ["%d", "%.4f"], os.path.join(args.save_dir, "test_loss%d.csv" % args.save_id), ) ### declare the loss information best_valid_rmse = np.inf no_better_valid = 0 best_iter = -1 avg_gnorm = 0 count_rmse = 0 count_num = 0 count_loss = 0 print("Start training ...") dur = [] for iter_idx in range(1, args.train_max_iter): if iter_idx > 3: t0 = time.time() with mx.autograd.record(): pred_ratings = net( dataset.train_enc_graph, dataset.train_dec_graph, dataset.user_feature, dataset.movie_feature, ) loss = rating_loss_net(pred_ratings, train_gt_labels).mean() loss.backward() count_loss += loss.asscalar() gnorm = params_clip_global_norm(net.collect_params(), args.train_grad_clip, args.ctx) avg_gnorm += gnorm trainer.step(1.0) if iter_idx > 3: dur.append(time.time() - t0) if iter_idx == 1: print("Total #Param of net: %d" % (gluon_total_param_num(net))) print( gluon_net_info(net, save_path=os.path.join( args.save_dir, "net%d.txt" % args.save_id))) real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) * nd_possible_rating_values.reshape( (1, -1))).sum(axis=1) rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum() count_rmse += rmse.asscalar() count_num += pred_ratings.shape[0] if iter_idx % args.train_log_interval == 0: train_loss_logger.log(iter=iter_idx, loss=count_loss / (iter_idx + 1), rmse=count_rmse / count_num) logging_str = "Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format( iter_idx, avg_gnorm / args.train_log_interval, count_loss / iter_idx, count_rmse / count_num, np.average(dur), ) avg_gnorm = 0 count_rmse = 0 count_num = 0 if iter_idx % args.train_valid_interval == 0: valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment="valid") valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse) logging_str += ",\tVal RMSE={:.4f}".format(valid_rmse) if valid_rmse < best_valid_rmse: best_valid_rmse = valid_rmse no_better_valid = 0 best_iter = iter_idx net.save_parameters(filename=os.path.join( args.save_dir, "best_valid_net{}.params".format( args.save_id))) test_rmse = evaluate(args=args, net=net, dataset=dataset, segment="test") best_test_rmse = test_rmse test_loss_logger.log(iter=iter_idx, rmse=test_rmse) logging_str += ", Test RMSE={:.4f}".format(test_rmse) else: no_better_valid 
+= 1 if (no_better_valid > args.train_early_stopping_patience and trainer.learning_rate <= args.train_min_lr): logging.info( "Early stopping threshold reached. Stop training.") break if no_better_valid > args.train_decay_patience: new_lr = max( trainer.learning_rate * args.train_lr_decay_factor, args.train_min_lr) if new_lr < trainer.learning_rate: logging.info("\tChange the LR to %g" % new_lr) trainer.set_learning_rate(new_lr) no_better_valid = 0 if iter_idx % args.train_log_interval == 0: print(logging_str) print("Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}". format(best_iter, best_valid_rmse, best_test_rmse)) train_loss_logger.close() valid_loss_logger.close() test_loss_logger.close()
def train_linear_one_epoch(train_loader, model, criterion, optimizer, config,
                           device, epoch):
    log_header = 'EPOCH {}'.format(epoch + 1)
    losses = AverageMeter('Loss', fmt=':.4f')
    top1 = AverageMeter('Top1', fmt=':4.2f')
    top5 = AverageMeter('Top5', fmt=':4.2f')
    lr = AverageMeter('Lr', fmt=":.4f")

    metric_logger = MetricLogger(delimeter=" | ")
    metric_logger.add_meter(losses)
    metric_logger.add_meter(top1)
    metric_logger.add_meter(top5)
    metric_logger.add_meter(lr)

    for step, (img, target) in enumerate(
            metric_logger.log_every(train_loader, config.system.print_freq, log_header)):
        img = img.to(device)
        target = target.to(device)

        logit = model(img)
        loss = criterion(logit, target)
        acc1, acc5 = accuracy(logit, target, topk=(1, 5))

        lr_ = optimizer.param_groups[0]['lr']
        metric_logger.update(Loss=loss.detach().cpu().item(),
                             Top1=acc1.detach().cpu().item(),
                             Top5=acc5.detach().cpu().item(),
                             Lr=lr_)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
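Both meter-based loops above (the contrastive train_one_epoch and train_linear_one_epoch) build AverageMeter(name, fmt) objects and register them on the custom MetricLogger. A minimal sketch of the assumed meter, following the common PyTorch ImageNet-example pattern; the project's own class may track more state:

class AverageMeter:
    """Track the latest value and the running average of a metric."""

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)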
def main(): resume = True path = 'data/NYU_DEPTH' batch_size = 16 epochs = 10000 device = torch.device('cuda:0') print_every = 5 # exp_name = 'resnet18_nodropout_new' exp_name = 'only_depth' # exp_name = 'normal_internel' # exp_name = 'sep' lr = 1e-5 weight_decay = 0.0005 log_dir = os.path.join('logs', exp_name) model_dir = os.path.join('checkpoints', exp_name) val_every = 16 save_every = 16 # tensorboard # remove old log is not to resume if not resume: if os.path.exists(log_dir): shutil.rmtree(log_dir) os.makedirs(log_dir) if not os.path.exists(model_dir): os.makedirs(model_dir) tb = SummaryWriter(log_dir) tb.add_custom_scalars({ 'metrics': { 'thres_1.25': ['Multiline', ['thres_1.25/train', 'thres_1.25/test']], 'thres_1.25_2': ['Multiline', ['thres_1.25_2/train', 'thres_1.25_2/test']], 'thres_1.25_3': ['Multiline', ['thres_1.25_3/train', 'thres_1.25_3/test']], 'ard': ['Multiline', ['ard/train', 'ard/test']], 'srd': ['Multiline', ['srd/train', 'srd/test']], 'rmse_linear': ['Multiline', ['rmse_linear/train', 'rmse_linear/test']], 'rmse_log': ['Multiline', ['rmse_log/train', 'rmse_log/test']], 'rmse_log_invariant': ['Multiline', ['rmse_log_invariant/train', 'rmse_log_invariant/test']], } }) # data loader dataset = NYUDepth(path, 'train') dataloader = DataLoader(dataset, batch_size, shuffle=True, num_workers=4) dataset_test = NYUDepth(path, 'test') dataloader_test = DataLoader(dataset_test, batch_size, shuffle=True, num_workers=4) # load model model = FCRN(True) model = model.to(device) # optimizer optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay) start_epoch = 0 if resume: model_path = os.path.join(model_dir, 'model.pth') if os.path.exists(model_path): print('Loading checkpoint from {}...'.format(model_path)) # load model and optimizer checkpoint = torch.load(os.path.join(model_dir, 'model.pth'), map_location='cpu') model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) start_epoch = checkpoint['epoch'] print('Model loaded.') else: print('No checkpoint found. 
Train from scratch') # training metric_logger = MetricLogger() end = time.perf_counter() max_iters = epochs * len(dataloader) def normal_loss(pred, normal, conf): """ :param pred: (B, 3, H, W) :param normal: (B, 3, H, W) :param conf: 1 """ dot_prod = (pred * normal).sum(dim=1) # weighted loss, (B, ) batch_loss = ((1 - dot_prod) * conf[:, 0]).sum(1).sum(1) # normalize, to (B, ) batch_loss /= conf[:, 0].sum(1).sum(1) return batch_loss.mean() def consistency_loss(pred, cloud, normal, conf): """ :param pred: (B, 1, H, W) :param normal: (B, 3, H, W) :param cloud: (B, 3, H, W) :param conf: (B, 1, H, W) """ B, _, _, _ = normal.size() normal = normal.detach() cloud = cloud.clone() cloud[:, 2:3, :, :] = pred # algorithm: use a kernel kernel = torch.ones((1, 1, 7, 7), device=pred.device) kernel = -kernel kernel[0, 0, 3, 3] = 48 cloud_0 = cloud[:, 0:1] cloud_1 = cloud[:, 1:2] cloud_2 = cloud[:, 2:3] diff_0 = F.conv2d(cloud_0, kernel, padding=6, dilation=2) diff_1 = F.conv2d(cloud_1, kernel, padding=6, dilation=2) diff_2 = F.conv2d(cloud_2, kernel, padding=6, dilation=2) # (B, 3, H, W) diff = torch.cat((diff_0, diff_1, diff_2), dim=1) # normalize diff = F.normalize(diff, dim=1) # (B, 1, H, W) dot_prod = (diff * normal).sum(dim=1, keepdim=True) # weighted mean over image dot_prod = torch.abs(dot_prod.view(B, -1)) conf = conf.view(B, -1) loss = (dot_prod * conf).sum(1) / conf.sum(1) # mean over batch return loss.mean() def criterion(depth_pred, normal_pred, depth, normal, cloud, conf): mse_loss = F.mse_loss(depth_pred, depth) consis_loss = consistency_loss(depth_pred, cloud, normal_pred, conf) norm_loss = normal_loss(normal_pred, normal, conf) consis_loss = torch.zeros_like(norm_loss) return mse_loss, mse_loss, mse_loss # return mse_loss, consis_loss, norm_loss # return norm_loss, norm_loss, norm_loss print('Start training') for epoch in range(start_epoch, epochs): # train model.train() for i, data in enumerate(dataloader): start = end i += 1 data = [x.to(device) for x in data] image, depth, normal, conf, cloud = data depth_pred, normal_pred = model(image) mse_loss, consis_loss, norm_loss = criterion(depth_pred, normal_pred, depth, normal, cloud, conf) loss = mse_loss + consis_loss + norm_loss optimizer.zero_grad() loss.backward() optimizer.step() # bookkeeping end = time.perf_counter() metric_logger.update(loss=loss.item()) metric_logger.update(mse_loss=mse_loss.item()) metric_logger.update(norm_loss=norm_loss.item()) metric_logger.update(consis_loss=consis_loss.item()) metric_logger.update(batch_time=end-start) if i % print_every == 0: # Compute eta. 
global step: starting from 1 global_step = epoch * len(dataloader) + i seconds = (max_iters - global_step) * metric_logger['batch_time'].global_avg eta = datetime.timedelta(seconds=int(seconds)) # to display: eta, epoch, iteration, loss, batch_time display_dict = { 'eta': eta, 'epoch': epoch, 'iter': i, 'loss': metric_logger['loss'].median, 'batch_time': metric_logger['batch_time'].median } display_str = [ 'eta: {eta}s', 'epoch: {epoch}', 'iter: {iter}', 'loss: {loss:.4f}', 'batch_time: {batch_time:.4f}s', ] print(', '.join(display_str).format(**display_dict)) # tensorboard min_depth = depth[0].min() max_depth = depth[0].max() * 1.25 depth = (depth[0] - min_depth) / (max_depth - min_depth) depth_pred = (depth_pred[0] - min_depth) / (max_depth - min_depth) depth_pred = torch.clamp(depth_pred, min=0.0, max=1.0) normal = (normal[0] + 1) / 2 normal_pred = (normal_pred[0] + 1) / 2 conf = conf[0] tb.add_scalar('train/loss', metric_logger['loss'].median, global_step) tb.add_scalar('train/mse_loss', metric_logger['mse_loss'].median, global_step) tb.add_scalar('train/consis_loss', metric_logger['consis_loss'].median, global_step) tb.add_scalar('train/norm_loss', metric_logger['norm_loss'].median, global_step) tb.add_image('train/depth', depth, global_step) tb.add_image('train/normal', normal, global_step) tb.add_image('train/depth_pred', depth_pred, global_step) tb.add_image('train/normal_pred', normal_pred, global_step) tb.add_image('train/conf', conf, global_step) tb.add_image('train/image', image[0], global_step) if (epoch) % val_every == 0 and epoch != 0: # validate after each epoch validate(dataloader, model, device, tb, epoch, 'train') validate(dataloader_test, model, device, tb, epoch, 'test') if (epoch) % save_every == 0 and epoch != 0: to_save = { 'optimizer': optimizer.state_dict(), 'model': model.state_dict(), 'epoch': epoch, } torch.save(to_save, os.path.join(model_dir, 'model.pth'))
def train(cfg): print('Experiment name:', cfg.exp_name) print('Dataset:', cfg.dataset) print('Model name:', cfg.model) print('Resume:', cfg.resume) if cfg.resume: print('Checkpoint:', cfg.resume_ckpt if cfg.resume_ckpt else 'last checkpoint') print('Using device:', cfg.device) if 'cuda' in cfg.device: print('Using parallel:', cfg.parallel) if cfg.parallel: print('Device ids:', cfg.device_ids) print('Loading data') if cfg.exp_name == 'table': data_set = np.load('{}/train/all_set_train.npy'.format( cfg.dataset_roots.TABLE)) data_size = len(data_set) else: trainloader = get_dataloader(cfg, 'train') data_size = len(trainloader) if cfg.train.eval_on: valset = get_dataset(cfg, 'val') # valloader = get_dataloader(cfg, 'val') evaluator = get_evaluator(cfg) model = get_model(cfg) model = model.to(cfg.device) checkpointer = Checkpointer(osp.join(cfg.checkpointdir, cfg.exp_name), max_num=cfg.train.max_ckpt) model.train() optimizer_fg, optimizer_bg = get_optimizers(cfg, model) start_epoch = 0 start_iter = 0 global_step = 0 if cfg.resume: checkpoint = checkpointer.load_last(cfg.resume_ckpt, model, optimizer_fg, optimizer_bg) if checkpoint: start_epoch = checkpoint['epoch'] global_step = checkpoint['global_step'] + 1 if cfg.parallel: model = nn.DataParallel(model, device_ids=cfg.device_ids) writer = SummaryWriter(log_dir=os.path.join(cfg.logdir, cfg.exp_name), flush_secs=30, purge_step=global_step) vis_logger = get_vislogger(cfg) metric_logger = MetricLogger() print('Start training') end_flag = False for epoch in range(start_epoch, cfg.train.max_epochs): if end_flag: break if cfg.exp_name == 'table': # creates indexes and shuffles them. So it can acces the data idx_set = np.arange(data_size) np.random.shuffle(idx_set) idx_set = np.split(idx_set, len(idx_set) / cfg.train.batch_size) data_to_enumerate = idx_set else: trainloader = get_dataloader(cfg, 'train') data_to_enumerate = trainloader data_size = len(trainloader) start = time.perf_counter() for i, enumerated_data in enumerate(data_to_enumerate): end = time.perf_counter() data_time = end - start start = end model.train() if cfg.exp_name == 'table': data_i = data_set[enumerated_data] data_i = torch.from_numpy(data_i).float().to(cfg.device) data_i /= 255 data_i = data_i.permute([0, 3, 1, 2]) imgs = data_i else: imgs = enumerated_data imgs = imgs.to(cfg.device) loss, log = model(imgs, global_step) # In case of using DataParallel loss = loss.mean() optimizer_fg.zero_grad() optimizer_bg.zero_grad() loss.backward() if cfg.train.clip_norm: clip_grad_norm_(model.parameters(), cfg.train.clip_norm) optimizer_fg.step() # if cfg.train.stop_bg == -1 or global_step < cfg.train.stop_bg: optimizer_bg.step() end = time.perf_counter() batch_time = end - start metric_logger.update(data_time=data_time) metric_logger.update(batch_time=batch_time) metric_logger.update(loss=loss.item()) if (global_step) % cfg.train.print_every == 0: start = time.perf_counter() log.update({ 'loss': metric_logger['loss'].median, }) vis_logger.train_vis(writer, log, global_step, 'train') end = time.perf_counter() print( 'exp: {}, epoch: {}, iter: {}/{}, global_step: {}, loss: {:.2f}, batch time: {:.4f}s, data time: {:.4f}s, log time: {:.4f}s' .format(cfg.exp_name, epoch + 1, i + 1, data_size, global_step, metric_logger['loss'].median, metric_logger['batch_time'].avg, metric_logger['data_time'].avg, end - start)) if (global_step) % cfg.train.create_image_every == 0: vis_logger.test_create_image( log, '../output/{}_img_{}.png'.format(cfg.dataset, global_step)) if (global_step) % 
cfg.train.save_every == 0: start = time.perf_counter() checkpointer.save_last(model, optimizer_fg, optimizer_bg, epoch, global_step) print('Saving checkpoint takes {:.4f}s.'.format( time.perf_counter() - start)) if (global_step) % cfg.train.eval_every == 0 and cfg.train.eval_on: pass '''print('Validating...') start = time.perf_counter() checkpoint = [model, optimizer_fg, optimizer_bg, epoch, global_step] if cfg.exp_name == 'table': evaluator.train_eval(model, None, None, writer, global_step, cfg.device, checkpoint, checkpointer) else: evaluator.train_eval(model, valset, valset.bb_path, writer, global_step, cfg.device, checkpoint, checkpointer) print('Validation takes {:.4f}s.'.format(time.perf_counter() - start))''' start = time.perf_counter() global_step += 1 if global_step > cfg.train.max_steps: end_flag = True break
def train(args): dataset = MovieLens(args.data_name, args.ctx, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm) print("Loading data finished ...\n") args.src_key = dataset.name_user args.dst_key = dataset.name_movie args.src_in_units = dataset.user_feature.shape[1] args.dst_in_units = dataset.movie_feature.shape[1] args.nratings = dataset.possible_rating_values.size ### build the net net = Net(args=args) net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx) net.hybridize() if args.gen_r_use_classification: nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values, ctx=args.ctx, dtype=np.float32) rating_loss_net = gluon.loss.SoftmaxCELoss() else: rating_mean = dataset.train_rating_values.mean() rating_std = dataset.train_rating_values.std() rating_loss_net = gluon.loss.L2Loss() rating_loss_net.hybridize() trainer = gluon.Trainer(net.collect_params(), args.train_optimizer, {'learning_rate': args.train_lr}) print("Loading network finished ...\n") ### perpare training data train_rating_pairs = mx.nd.array(dataset.train_rating_pairs, ctx=args.ctx, dtype=np.int64) train_gt_ratings = mx.nd.array(dataset.train_rating_values, ctx=args.ctx, dtype=np.float32) ### prepare the logger train_loss_logger = MetricLogger( ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'], os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id)) valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'], os.path.join( args.save_dir, 'valid_loss%d.csv' % args.save_id)) test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'], os.path.join( args.save_dir, 'test_loss%d.csv' % args.save_id)) ### declare the loss information best_valid_rmse = np.inf no_better_valid = 0 best_iter = -1 avg_gnorm = 0 count_rmse = 0 count_num = 0 count_loss = 0 print("Start training ...") for iter_idx in range(1, args.train_max_iter): if args.gen_r_use_classification: train_gt_label = mx.nd.array(np.searchsorted( dataset.possible_rating_values, dataset.train_rating_values), ctx=args.ctx, dtype=np.int32) with mx.autograd.record(): pred_ratings = net(dataset.train_graph, train_rating_pairs) if args.gen_r_use_classification: loss = rating_loss_net(pred_ratings, train_gt_label).mean() else: loss = rating_loss_net( mx.nd.reshape(pred_ratings, shape=(-1, )), (train_gt_ratings - rating_mean) / rating_std).mean() #loss.wait_to_read() loss.backward() count_loss += loss.asscalar() gnorm = params_clip_global_norm(net.collect_params(), args.train_grad_clip, args.ctx) avg_gnorm += gnorm trainer.step(1.0) #, ignore_stale_grad=True) if iter_idx == 1: print("Total #Param of net: %d" % (gluon_total_param_num(net))) print( gluon_net_info(net, save_path=os.path.join( args.save_dir, 'net%d.txt' % args.save_id))) if args.gen_r_use_classification: real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) * nd_possible_rating_values.reshape( (1, -1))).sum(axis=1) rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum() else: rmse = mx.nd.square( pred_ratings.reshape((-1, )) * rating_std + rating_mean - train_gt_ratings).sum() count_rmse += rmse.asscalar() count_num += pred_ratings.shape[0] if iter_idx % args.train_log_interval == 0: train_loss_logger.log(iter=iter_idx, loss=count_loss / (iter_idx + 1), rmse=count_rmse / count_num) logging_str = "Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}".format( iter_idx, avg_gnorm / args.train_log_interval, count_loss / iter_idx, count_rmse / count_num) avg_gnorm = 0 count_rmse = 0 count_num = 0 if iter_idx % args.train_valid_interval == 0: valid_rmse = 
evaluate(args=args, net=net, dataset=dataset, segment='valid') valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse) logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse) if valid_rmse < best_valid_rmse: best_valid_rmse = valid_rmse no_better_valid = 0 best_iter = iter_idx #net.save_parameters(filename=os.path.join(args.save_dir, 'best_valid_net{}.params'.format(args.save_id))) test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test') best_test_rmse = test_rmse test_loss_logger.log(iter=iter_idx, rmse=test_rmse) logging_str += ', Test RMSE={:.4f}'.format(test_rmse) else: no_better_valid += 1 if no_better_valid > args.train_early_stopping_patience\ and trainer.learning_rate <= args.train_min_lr: logging.info( "Early stopping threshold reached. Stop training.") break if no_better_valid > args.train_decay_patience: new_lr = max( trainer.learning_rate * args.train_lr_decay_factor, args.train_min_lr) if new_lr < trainer.learning_rate: logging.info("\tChange the LR to %g" % new_lr) trainer.set_learning_rate(new_lr) no_better_valid = 0 if iter_idx % args.train_log_interval == 0: print(logging_str) print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'. format(best_iter, best_valid_rmse, best_test_rmse)) train_loss_logger.close() valid_loss_logger.close() test_loss_logger.close()
def train(args): print(args) dataset = MovieLens(args.data_name, args.ctx, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm, test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio) print("Loading data finished ...\n") args.src_in_units = dataset.user_feature_shape[1] args.dst_in_units = dataset.movie_feature_shape[1] args.rating_vals = dataset.possible_rating_values ### build the net net = Net(args=args) net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx) net.hybridize() rating_loss_net = gluon.loss.SoftmaxCELoss() rating_loss_net.hybridize() trainer = gluon.Trainer(net.collect_params(), args.train_optimizer, {'learning_rate': args.train_lr}) print("Loading network finished ...\n") ### perpare training data train_gt_labels = dataset.train_labels train_gt_ratings = dataset.train_truths ### prepare the logger train_loss_logger = MetricLogger(['iter', 'idx', 'loss', 'rmse'], ['%d', '%d', '%.4f', '%.4f'], os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id)) valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'], os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id)) test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'], os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id)) ### declare the loss information best_valid_rmse = np.inf no_better_valid = 0 best_iter = -1 enc_graph = dataset.train_enc_graph nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values, ctx=args.ctx, dtype=np.float32) g_user_fea = mx.nd.zeros((dataset.num_user,)) g_movie_fea = mx.nd.zeros((dataset.num_movie,)) train_truths = dataset.train_truths train_labels = dataset.train_labels print("Start training ...") dur = [] for iter_idx in range(1, args.train_max_iter): if iter_idx > 3: t0 = time.time() num_edges = dataset.train_truths.shape[0] seed = mx.nd.arange(num_edges, dtype='int64') edges = mx.nd.shuffle(seed) # each iteration will go through all edges for sample_idx in range(0, (num_edges + args.minibatch_size - 1) // args.minibatch_size): edge_ids = edges[sample_idx * args.minibatch_size: (sample_idx + 1) * args.minibatch_size if (sample_idx + 1) * args.minibatch_size < num_edges else num_edges] head_ids, tail_ids = dataset.train_dec_graph.find_edges(edge_ids.asnumpy()) head_subgraphs = {} tail_subgraphs = {} head_node_ids = np.unique(head_ids.asnumpy()) tail_node_ids = np.unique(tail_ids.asnumpy()) for i, _ in enumerate(args.rating_vals): t = enc_graph.canonical_etypes[i * 2] rev_t = enc_graph.canonical_etypes[i * 2 + 1] head_in_edges = enc_graph.in_edges(head_node_ids, 'eid', etype=rev_t) tail_in_edges = enc_graph.in_edges(tail_node_ids, 'eid', etype=t) if head_in_edges.shape[0] > 0: head_subgraphs[rev_t] = head_in_edges if tail_in_edges.shape[0] > 0: tail_subgraphs[t] = tail_in_edges head_subgraph = enc_graph.edge_subgraph(head_subgraphs, preserve_nodes=True) tail_subgraph = enc_graph.edge_subgraph(tail_subgraphs, preserve_nodes=True) edge_ids = edge_ids.as_in_context(args.ctx) true_relation_ratings = train_truths[edge_ids] true_relation_labels = train_labels[edge_ids] head_NID = head_subgraph.nodes['user'].data[dgl.NID] tail_NID = tail_subgraph.nodes['movie'].data[dgl.NID] g_user_fea[head_NID] = mx.nd.arange(head_NID.shape[0], dtype='int32') g_movie_fea[tail_NID] = mx.nd.arange(tail_NID.shape[0], dtype='int32') true_head_idx = g_user_fea[head_ids].as_in_context(args.ctx) true_tail_idx = g_movie_fea[tail_ids].as_in_context(args.ctx) with mx.autograd.record(): pred_ratings = net(head_subgraph, tail_subgraph, 
true_head_idx, true_tail_idx) loss = rating_loss_net(pred_ratings, true_relation_labels).mean() loss.backward() gnorm = params_clip_global_norm(net.collect_params(), args.train_grad_clip, args.ctx) trainer.step(1.0, ignore_stale_grad=True) real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) * nd_possible_rating_values.reshape((1, -1))).sum(axis=1) rmse = mx.nd.square(real_pred_ratings - true_relation_ratings).mean().asscalar() rmse = np.sqrt(rmse) loss = loss.asscalar() if sample_idx % 100 == 0: train_loss_logger.log(iter=iter_idx, idx=sample_idx, loss=loss, rmse=rmse) print("Iter={}, sample_idx={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}".format(iter_idx, sample_idx, gnorm, loss, rmse)) gc.collect() if iter_idx > 3: dur.append(time.time() - t0) if iter_idx == 1: print("Total #Param of net: %d" % (gluon_total_param_num(net))) print(gluon_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id))) if iter_idx % args.train_log_interval == 0: logging_str = "Iter={}, time={:.4f}".format( iter_idx, np.average(dur)) if iter_idx % args.train_valid_interval == 0: valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid') valid_loss_logger.log(iter = iter_idx, rmse = valid_rmse) logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse) if valid_rmse < best_valid_rmse: best_valid_rmse = valid_rmse no_better_valid = 0 best_iter = iter_idx #net.save_parameters(filename=os.path.join(args.save_dir, 'best_valid_net{}.params'.format(args.save_id))) test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test') best_test_rmse = test_rmse test_loss_logger.log(iter=iter_idx, rmse=test_rmse) logging_str += ', Test RMSE={:.4f}'.format(test_rmse) else: no_better_valid += 1 if no_better_valid > args.train_early_stopping_patience\ and trainer.learning_rate <= args.train_min_lr: logging.info("Early stopping threshold reached. Stop training.") break if no_better_valid > args.train_decay_patience: new_lr = max(trainer.learning_rate * args.train_lr_decay_factor, args.train_min_lr) if new_lr < trainer.learning_rate: logging.info("\tChange the LR to %g" % new_lr) trainer.set_learning_rate(new_lr) no_better_valid = 0 if iter_idx % args.train_log_interval == 0: print(logging_str) print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format( best_iter, best_valid_rmse, best_test_rmse)) train_loss_logger.close() valid_loss_logger.close() test_loss_logger.close()
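# The GCMC trainers above treat rating prediction as classification over the discrete
# rating values and recover a real-valued prediction as the probability-weighted average
# of the possible ratings before computing RMSE. The sketch below mirrors that conversion
# in PyTorch (the MXNet code uses mx.nd.softmax for the same step); tensor names are
# illustrative, this is a minimal example rather than code from the original project.
import torch

def expected_rating_rmse(pred_logits, possible_values, true_ratings):
    """Turn per-class rating logits into an expected rating and compute RMSE.

    pred_logits:     (num_edges, num_classes) raw decoder scores
    possible_values: (num_classes,) e.g. torch.tensor([1., 2., 3., 4., 5.])
    true_ratings:    (num_edges,) ground-truth real-valued ratings
    """
    probs = torch.softmax(pred_logits, dim=1)                     # class probabilities
    expected = (probs * possible_values.view(1, -1)).sum(dim=1)   # weighted average rating
    return torch.sqrt(((expected - true_ratings) ** 2).mean())

# Illustrative usage with random data:
# expected_rating_rmse(torch.randn(8, 5), torch.tensor([1., 2., 3., 4., 5.]),
#                      torch.randint(1, 6, (8,)).float())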
def train(args): print(args) if args.data_name == 'jukebox': dataset = JukeboxDataset('dataset/listen_count.txt') else: dataset = MovieLens(args.data_name, args.device, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm, test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio) print("Loading data finished ...\n") args.src_in_units = dataset.user_feature_shape[1] args.dst_in_units = dataset.movie_feature_shape[1] args.rating_vals = dataset.possible_rating_values ### build the net net = Net(args=args) net = net.to(args.device) nd_possible_rating_values = th.FloatTensor(dataset.possible_rating_values).to(args.device) rating_loss_net = nn.MSELoss() learning_rate = args.train_lr optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate) print("Loading network finished ...\n") ### perpare training data train_gt_labels = dataset.train_labels train_gt_ratings = dataset.train_truths ### prepare the logger train_loss_logger = MetricLogger(['iter', 'loss'], ['%d', '%.4f', '%.4f'], os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id)) valid_loss_logger = MetricLogger(['iter', 'ndcg','precision','recall','fscore','support'], ['%d','%.4f', '%.4f','%s','%s','%s','%s'], os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id)) test_loss_logger = MetricLogger(['iter'], ['%d', '%.4f'], os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id)) ### declare the loss information best_valid_rmse = np.inf no_better_valid = 0 best_iter = -1 count_rmse = 1 count_num = 1 count_loss = 0 count_step = 0 dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device) dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device) dataset.valid_enc_graph = dataset.train_enc_graph dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device) dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device) dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device) def batch(iterable, n=1): current_batch = [] for item in iterable: current_batch.append(item) if len(current_batch) == n: yield current_batch current_batch = [] if current_batch: yield current_batch batches = [] print("Start training ...") dur = [] for iter_idx in range(1, args.train_max_iter): if iter_idx > 3: t0 = time.time() net.train() unique_item_list = dataset.train['item_id'].unique().tolist() ufeat, ifeat = net.encoder(dataset.train_enc_graph, dataset.user_feature, dataset.movie_feature) from tqdm import tqdm if iter_idx ==1: for row in tqdm(list(dataset.train.itertuples())): user, item, rating = row.user_id, row.item_id, row.rating userid = dataset.global_user_id_map[user] observed = dataset.train[dataset.train['user_id'] == user]['item_id'].unique().tolist() negatives = set() while len(negatives) < 1: sample = random.choice(unique_item_list) if sample not in observed: negatives.add(sample) batches.append((userid, dataset.global_item_id_map[item], dataset.global_item_id_map[sample])) for bt in tqdm(list(batch(batches, 2**14))): uidfeat = ufeat[[e[0] for e in bt]] posfeat = ifeat[[e[1] for e in bt]] negfeat = ifeat[[e[2] for e in bt]] pos_scores = uidfeat @ net.decoder.Q @ posfeat.T neg_scores = uidfeat @ net.decoder.Q @ negfeat.T lmbd = 1e-5 mf_loss = -nn.BCELoss()(th.sigmoid(pos_scores), th.ones_like(pos_scores)) + nn.LogSigmoid()(pos_scores - neg_scores).mean() mf_loss = -1 * mf_loss regularizer = (th.norm(uidfeat,dim=1)**2).mean() + (th.norm(posfeat,dim=1)**2).mean() + (th.norm(negfeat,dim=1)**2).mean() + (th.norm(net.decoder.Q)) emb_loss 
= lmbd * regularizer print('mf_loss', mf_loss) print('emb_loss', emb_loss) optimizer.zero_grad() loss = mf_loss + emb_loss count_loss += loss.item() loss.backward() nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip) optimizer.step() ufeat, ifeat = net.encoder(dataset.train_enc_graph, dataset.user_feature, dataset.movie_feature) count_step += 1 print('train done') if iter_idx > 3: dur.append(time.time() - t0) if iter_idx == 1: print("Total #Param of net: %d" % (torch_total_param_num(net))) print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id))) if iter_idx % args.train_log_interval == 0: train_loss_logger.log(iter=iter_idx, loss=count_loss / (count_step + 1)) logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format( iter_idx, count_loss/(count_step + 1), count_rmse/count_num, np.average(dur)) count_rmse = 1 count_num = 1 if iter_idx % args.train_valid_interval == 0: valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid') precision, recall, fscore, support = evaluate_others(args=args, net=net, dataset=dataset, segment='valid') ndcg = evaluate_ndcg(args=args, net=net, dataset=dataset, segment='valid') print('ndcg', ndcg, 'precision', precision, 'recall', recall, 'fscore', fscore, 'support', support) valid_loss_logger.log(iter=iter_idx, ndcg=ndcg, precision=precision, recall=recall, fscore=fscore, support=support) logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse) if valid_rmse < best_valid_rmse: best_valid_rmse = valid_rmse no_better_valid = 0 best_iter = iter_idx test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test') best_test_rmse = test_rmse test_loss_logger.log(iter=iter_idx) logging_str += ', Test RMSE={:.4f}'.format(test_rmse) else: no_better_valid += 1 if no_better_valid > args.train_early_stopping_patience\ and learning_rate <= args.train_min_lr: logging.info("Early stopping threshold reached. Stop training.") break if no_better_valid > args.train_decay_patience: new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr) if new_lr < learning_rate: learning_rate = new_lr logging.info("\tChange the LR to %g" % new_lr) for p in optimizer.param_groups: p['lr'] = learning_rate no_better_valid = 0 if iter_idx % args.train_log_interval == 0: print(logging_str) print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format( best_iter, best_valid_rmse, best_test_rmse)) train_loss_logger.close() valid_loss_logger.close() test_loss_logger.close()
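# The jukebox variant above samples one negative item per observed (user, item) pair and
# optimises a pairwise objective built from logsigmoid(pos - neg) plus an L2 penalty on the
# embeddings and the bilinear matrix Q. The sketch below is a self-contained BPR-style
# version of that objective using per-pair scores (rather than the batch-by-batch score
# matrices in the snippet); all names are illustrative and the exact regulariser in the
# original may differ.
import torch
import torch.nn.functional as F

def bpr_loss(user_emb, pos_item_emb, neg_item_emb, Q, lmbd=1e-5):
    """Pairwise ranking loss with bilinear scoring and L2 regularisation."""
    pos_scores = (user_emb @ Q * pos_item_emb).sum(dim=1)    # score(u, i+)
    neg_scores = (user_emb @ Q * neg_item_emb).sum(dim=1)    # score(u, i-)
    mf_loss = -F.logsigmoid(pos_scores - neg_scores).mean()  # rank positives above negatives
    reg = (user_emb.norm(dim=1) ** 2).mean() \
        + (pos_item_emb.norm(dim=1) ** 2).mean() \
        + (neg_item_emb.norm(dim=1) ** 2).mean() \
        + Q.norm() ** 2
    return mf_loss + lmbd * reg

# Illustrative usage:
# u, p, n = torch.randn(16, 32), torch.randn(16, 32), torch.randn(16, 32)
# Q = torch.randn(32, 32, requires_grad=True)
# bpr_loss(u, p, n, Q).backward()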
def train(cfg): print('Experiment name:', cfg.exp_name) print('Dataset:', cfg.dataset) print('Model name:', cfg.model) print('Resume:', cfg.resume) if cfg.resume: print('Checkpoint:', cfg.resume_ckpt if cfg.resume_ckpt else 'last checkpoint') print('Using device:', cfg.device) if 'cuda' in cfg.device: print('Using parallel:', cfg.parallel) if cfg.parallel: print('Device ids:', cfg.device_ids) print('Loading data') trainloader = get_dataloader(cfg, 'train') if cfg.train.eval_on: valset = get_dataset(cfg, 'val') # valloader = get_dataloader(cfg, 'val') evaluator = get_evaluator(cfg) model = get_model(cfg) model = model.to(cfg.device) checkpointer = Checkpointer(osp.join(cfg.checkpointdir, cfg.exp_name), max_num=cfg.train.max_ckpt) model.train() optimizer_fg, optimizer_bg = get_optimizers(cfg, model) start_epoch = 0 start_iter = 0 global_step = 0 if cfg.resume: checkpoint = checkpointer.load_last(cfg.resume_ckpt, model, optimizer_fg, optimizer_bg) if checkpoint: start_epoch = checkpoint['epoch'] global_step = checkpoint['global_step'] + 1 if cfg.parallel: model = nn.DataParallel(model, device_ids=cfg.device_ids) writer = SummaryWriter(log_dir=os.path.join(cfg.logdir, cfg.exp_name), flush_secs=30, purge_step=global_step) vis_logger = get_vislogger(cfg) metric_logger = MetricLogger() print('Start training') end_flag = False for epoch in range(start_epoch, cfg.train.max_epochs): if end_flag: break start = time.perf_counter() for i, data in enumerate(trainloader): end = time.perf_counter() data_time = end - start start = end model.train() imgs = data imgs = imgs.to(cfg.device) loss, log = model(imgs, global_step) # In case of using DataParallel loss = loss.mean() optimizer_fg.zero_grad() optimizer_bg.zero_grad() loss.backward() if cfg.train.clip_norm: clip_grad_norm_(model.parameters(), cfg.train.clip_norm) optimizer_fg.step() # if cfg.train.stop_bg == -1 or global_step < cfg.train.stop_bg: optimizer_bg.step() end = time.perf_counter() batch_time = end - start metric_logger.update(data_time=data_time) metric_logger.update(batch_time=batch_time) metric_logger.update(loss=loss.item()) if (global_step) % cfg.train.print_every == 0: start = time.perf_counter() log.update({ 'loss': metric_logger['loss'].median, }) vis_logger.train_vis(writer, log, global_step, 'train') end = time.perf_counter() print( 'exp: {}, epoch: {}, iter: {}/{}, global_step: {}, loss: {:.2f}, batch time: {:.4f}s, data time: {:.4f}s, log time: {:.4f}s'.format( cfg.exp_name, epoch + 1, i + 1, len(trainloader), global_step, metric_logger['loss'].median, metric_logger['batch_time'].avg, metric_logger['data_time'].avg, end - start)) if (global_step) % cfg.train.save_every == 0: start = time.perf_counter() checkpointer.save_last(model, optimizer_fg, optimizer_bg, epoch, global_step) print('Saving checkpoint takes {:.4f}s.'.format(time.perf_counter() - start)) if (global_step) % cfg.train.eval_every == 0 and cfg.train.eval_on: print('Validating...') start = time.perf_counter() checkpoint = [model, optimizer_fg, optimizer_bg, epoch, global_step] evaluator.train_eval(model, valset, valset.bb_path, writer, global_step, cfg.device, checkpoint, checkpointer) print('Validation takes {:.4f}s.'.format(time.perf_counter() - start)) start = time.perf_counter() global_step += 1 if global_step > cfg.train.max_steps: end_flag = True break
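# The loop above resumes from checkpointer.load_last(...) and periodically calls
# checkpointer.save_last(...), carrying epoch and global_step so that TensorBoard logging
# (purge_step=global_step) stays aligned after a restart. The class below is a minimal
# illustrative stand-in for such a helper, assuming only that the real Checkpointer stores
# the same pieces of state; its actual API differs (e.g. load_last also accepts an explicit
# checkpoint path and max_num bounds the number of kept files).
import os
import torch

class SimpleCheckpointer:
    """Minimal save/resume helper: keeps a single 'last' checkpoint on disk."""

    def __init__(self, checkpoint_dir):
        os.makedirs(checkpoint_dir, exist_ok=True)
        self.last_path = os.path.join(checkpoint_dir, 'last.pth')

    def save_last(self, model, optimizer_fg, optimizer_bg, epoch, global_step):
        torch.save({
            'model': model.state_dict(),
            'optimizer_fg': optimizer_fg.state_dict(),
            'optimizer_bg': optimizer_bg.state_dict(),
            'epoch': epoch,
            'global_step': global_step,
        }, self.last_path)

    def load_last(self, model, optimizer_fg, optimizer_bg):
        """Return the checkpoint dict if one exists, else None (fresh run)."""
        if not os.path.exists(self.last_path):
            return None
        ckpt = torch.load(self.last_path, map_location='cpu')
        model.load_state_dict(ckpt['model'])
        optimizer_fg.load_state_dict(ckpt['optimizer_fg'])
        optimizer_bg.load_state_dict(ckpt['optimizer_bg'])
        return ckpt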
def training(model, data_loader, optimizer, scheduler, checkpointer,
             device, checkpoint_period, arguments):
    logger = logging.getLogger("RetinaNet.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter=" ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration
        scheduler.step()
        images = images.to(device)
        targets = [target.to(device) for target in targets]
        loss_dict = model(images, targets)
        # the total loss that drives the backward pass
        losses = sum(loss for loss in loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)
        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
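# The trainer above logs loss_dict_reduced, i.e. each loss averaged over all GPUs, so the
# printed numbers do not depend on which rank does the logging. The helper below is a
# hedged sketch of what such a reduce_loss_dict typically looks like with torch.distributed;
# the repository's actual implementation may differ in detail.
import torch
import torch.distributed as dist

def reduce_loss_dict(loss_dict):
    """Average each loss tensor across ranks; a no-op outside distributed training."""
    if not (dist.is_available() and dist.is_initialized()):
        return loss_dict
    world_size = dist.get_world_size()
    if world_size == 1:
        return loss_dict
    with torch.no_grad():
        names = sorted(loss_dict.keys())                     # identical order on every rank
        values = torch.stack([loss_dict[k] for k in names])  # (num_losses,) tensor
        dist.all_reduce(values)                              # sum over ranks
        values /= world_size                                 # -> mean over ranks
        return {k: v for k, v in zip(names, values)}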
def train(inputs, outputs, args, logger):
    """
    :param inputs: (list) input tensors produced by preprocess.py
    :param outputs: (tensor) label tensor produced by preprocess.py
    :param args: the hyper-parameters fixed before training
    :param logger: training logger; the training process is recorded in ./ckpt/log.txt
    :return: nothing; returns when training has finished
    """
    # Build the dataset.
    # inputs[0] has shape (50000, 1024), i.e. (data_num, max_input_len)
    # outputs has shape (50000,), i.e. (data_num,)
    torch_dataset = Data.TensorDataset(inputs[0], inputs[1], inputs[2], outputs)
    loader = Data.DataLoader(dataset=torch_dataset, batch_size=args.batch_size, shuffle=True)
    logger.info('[1] Building model')
    # Pick the device the training script runs on; use CUDA when it is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Build the model.
    model = AlbertClassifierModel(num_topics=args.num_topics,
                                  out_channels=args.out_channels,
                                  max_input_len=args.max_input_len,
                                  kernel_size=args.kernel_size,
                                  dropout=args.dropout).to(device)
    model_kwargs = {k: getattr(args, k)
                    for k in {'num_topics', 'out_channels', 'max_input_len', 'kernel_size', 'dropout'}}
    logger.info(model)
    # Optimizer.
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optim == 'adamw':
        optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9,
                              weight_decay=args.weight_decay)
    meters = MetricLogger(delimiter=" ")
    # Classification loss on raw logits (CrossEntropyLoss applies log-softmax internally).
    criterion = nn.CrossEntropyLoss()
    # Scheduler: multiply the learning rate by 0.1 at schedule_step; currently this decay
    # only happens at the first scheduled step.
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [args.schedule_step], gamma=0.1)
    logger.info('[2] Start training......')
    for epoch_num in range(args.max_epoch):
        # example_num: how many batches one epoch trains on
        example_num = outputs.shape[0] // args.batch_size
        for batch_iter, (input_ids, segments_tensor, attention_mask, label) in enumerate(loader):
            progress = epoch_num + batch_iter / example_num
            optimizer.zero_grad()
            batch_size = args.batch_size
            # Forward pass.
            pred = model(input_ids.to(device).view(batch_size, -1),
                         attention_mask.to(device).view(batch_size, -1))
            # Handle the labels; drop the last incomplete batch.
            if label.shape[0] != args.batch_size:
                logger.info('last dummy batch')
                break
            label = label.view(args.batch_size)
            label = label.to(device)
            loss = criterion(pred, label)
            # Backward pass.
            loss.backward()
            optimizer.step()
            meters.update(loss=loss)
            # Log the loss once every 0.01 epoch.
            if (batch_iter + 1) % (example_num // 100) == 0:
                logger.info(
                    meters.delimiter.join(
                        [
                            "progress: {prog:.2f}",
                            "{meters}",
                        ]
                    ).format(
                        prog=progress,
                        meters=str(meters),
                    )
                )
            # In debug mode, skip straight to validation.
            if args.debug:
                break
        # Validate this epoch's performance.
        precision, score = validate(model, device, args)
        logger.info("val")
        logger.info("precision")
        logger.info(precision)
        logger.info("official score")
        logger.info(score)
        save = {
            'kwargs': model_kwargs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        scheduler.step()
        # Keep one checkpoint per epoch.
        torch.save(save, os.path.join(args.save_dir,
                                      'model_epoch%d_val%.3f.pt' % (epoch_num, score)))
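# The checkpoint written at the end of each epoch stores the constructor kwargs next to the
# weights, so the model can be rebuilt for inference without re-parsing args. A hedged usage
# sketch follows; the path is illustrative (real files follow the 'model_epoch%d_val%.3f.pt'
# pattern used above).
import torch

ckpt = torch.load('ckpt/model_epoch0_val0.800.pt', map_location='cpu')
model = AlbertClassifierModel(**ckpt['kwargs'])   # rebuild from the saved kwargs
model.load_state_dict(ckpt['state_dict'])         # restore trained weights
model.eval()
# Optimizer state is also stored, should training need to resume:
# optimizer.load_state_dict(ckpt['optimizer'])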
def train(args): print(args) # dataset = MovieLens(args.data_name, args.device, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm, # test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio) dataset = DataSetLoader(args.data_name, args.device, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm, test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio) print("Loading data finished ...\n") args.src_in_units = dataset.user_feature_shape[1] args.dst_in_units = dataset.movie_feature_shape[1] args.rating_vals = dataset.possible_rating_values ### build the net net = Net(args=args) net = net.to(args.device) nd_possible_rating_values = th.FloatTensor(dataset.possible_rating_values).to(args.device) rating_loss_net = nn.CrossEntropyLoss() learning_rate = args.train_lr optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate) print("Loading network finished ...\n") ### perpare training data train_gt_labels = dataset.train_labels train_gt_ratings = dataset.train_truths ### prepare the logger train_loss_logger = MetricLogger(['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'], os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id)) valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'], os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id)) test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'], os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id)) ### declare the loss information best_valid_rmse = np.inf no_better_valid = 0 best_iter = -1 count_rmse = 0 count_num = 0 count_loss = 0 dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device) dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device) dataset.valid_enc_graph = dataset.train_enc_graph dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device) dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device) dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device) print("Start training ...") dur = [] for iter_idx in range(1, args.train_max_iter): if iter_idx > 3: t0 = time.time() net.train() pred_ratings = net(dataset.train_enc_graph, dataset.train_dec_graph, dataset.user_feature, dataset.movie_feature) loss = rating_loss_net(pred_ratings, train_gt_labels).mean() count_loss += loss.item() optimizer.zero_grad() loss.backward() nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip) optimizer.step() if iter_idx > 3: dur.append(time.time() - t0) if iter_idx == 1: print("Total #Param of net: %d" % (torch_total_param_num(net))) print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id))) real_pred_ratings = (th.softmax(pred_ratings, dim=1) * nd_possible_rating_values.view(1, -1)).sum(dim=1) rmse = ((real_pred_ratings - train_gt_ratings) ** 2).sum() count_rmse += rmse.item() count_num += pred_ratings.shape[0] if iter_idx % args.train_log_interval == 0: train_loss_logger.log(iter=iter_idx, loss=count_loss/(iter_idx+1), rmse=count_rmse/count_num) logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format( iter_idx, count_loss/iter_idx, count_rmse/count_num, np.average(dur)) count_rmse = 0 count_num = 0 if iter_idx % args.train_valid_interval == 0: valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid') valid_loss_logger.log(iter = iter_idx, rmse = valid_rmse) logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse) if valid_rmse < best_valid_rmse: best_valid_rmse = valid_rmse no_better_valid = 0 best_iter 
= iter_idx test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test') best_test_rmse = test_rmse test_loss_logger.log(iter=iter_idx, rmse=test_rmse) logging_str += ', Test RMSE={:.4f}'.format(test_rmse) else: no_better_valid += 1 if no_better_valid > args.train_early_stopping_patience\ and learning_rate <= args.train_min_lr: logging.info("Early stopping threshold reached. Stop training.") break if no_better_valid > args.train_decay_patience: new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr) if new_lr < learning_rate: learning_rate = new_lr logging.info("\tChange the LR to %g" % new_lr) for p in optimizer.param_groups: p['lr'] = learning_rate no_better_valid = 0 if iter_idx % args.train_log_interval == 0: print(logging_str) print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format( best_iter, best_valid_rmse, best_test_rmse)) train_loss_logger.close() valid_loss_logger.close() test_loss_logger.close()
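# The GCMC trainers above all share the same schedule: track the best validation RMSE, decay
# the learning rate after train_decay_patience evaluations without improvement, and stop once
# train_early_stopping_patience is exceeded and the learning rate has reached train_min_lr.
# The class below is a small stand-alone sketch of that bookkeeping; the name and interface
# are illustrative, not from the original code.
class PlateauController:
    """Early stopping plus LR decay on a validation-metric plateau (lower is better)."""

    def __init__(self, decay_patience, stop_patience, decay_factor, min_lr):
        self.decay_patience = decay_patience
        self.stop_patience = stop_patience
        self.decay_factor = decay_factor
        self.min_lr = min_lr
        self.best = float('inf')
        self.no_better = 0

    def step(self, metric, lr):
        """Return (new_lr, should_stop, improved) for the latest validation metric."""
        if metric < self.best:
            self.best = metric
            self.no_better = 0
            return lr, False, True
        self.no_better += 1
        if self.no_better > self.stop_patience and lr <= self.min_lr:
            return lr, True, False
        if self.no_better > self.decay_patience:
            lr = max(lr * self.decay_factor, self.min_lr)
            self.no_better = 0
        return lr, False, False

# Illustrative usage inside a training loop:
# new_lr, stop, improved = controller.step(valid_rmse, learning_rate)
# if new_lr < learning_rate:
#     for p in optimizer.param_groups:
#         p['lr'] = new_lr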
def train_eval(args): logging_config(folder=args.save_dir, name='log{:d}'.format(args.save_id), no_console=False) logging.info(args) ### check context use_cuda = args.gpu >= 0 and th.cuda.is_available() if use_cuda: th.cuda.set_device(args.gpu) ### load data dataset = DataLoader(data_name=args.data_name, seed=args.seed) print(dataset) model = Model(use_KG=True, input_node_dim=args.entity_embed_dim, gnn_model=args.gnn_model, num_gnn_layers=args.gnn_num_layer, n_hidden=args.gnn_hidden_size, dropout=args.dropout_rate, n_entities=dataset.n_KG_entity, n_relations=dataset.n_KG_relation, relation_dim=args.relation_embed_dim, reg_lambda_kg=args.regs, reg_lambda_gnn=args.regs) if use_cuda: model.cuda() logging.info(model) ### optimizer optimizer = optim.Adam(model.parameters(), lr=args.lr) valid_metric_logger = MetricLogger( ['epoch', 'recall', 'ndcg', 'is_best'], ['%d', '%.5f', '%.5f', '%d'], os.path.join(args.save_dir, 'valid{:d}.csv'.format(args.save_id))) test_metric_logger = MetricLogger( ['epoch', 'recall', 'ndcg'], ['%d', '%.5f', '%.5f'], os.path.join(args.save_dir, 'test{:d}.csv'.format(args.save_id))) best_epoch = -1 best_recall = 0.0 train_g = dataset.train_g nid_th = th.LongTensor(train_g.ndata["id"]) etype_th = th.LongTensor(train_g.edata["type"]) if use_cuda: nid_th, etype_th = nid_th.cuda(), etype_th.cuda() train_g.ndata['id'] = nid_th train_g.edata['type'] = etype_th test_g = dataset.test_g nid_th = th.LongTensor(test_g.ndata["id"]) etype_th = th.LongTensor(test_g.edata["type"]) if use_cuda: nid_th, etype_th = nid_th.cuda(), etype_th.cuda() test_g.ndata['id'] = nid_th test_g.edata['type'] = etype_th item_id_range = th.LongTensor(dataset.item_id_range).cuda() if use_cuda \ else th.LongTensor(dataset.item_id_range) for epoch in range(1, args.max_epoch + 1): ### train kg time1 = time() kg_sampler = dataset.KG_sampler(batch_size=args.batch_size_kg) iter = 0 total_loss = 0.0 for h, r, pos_t, neg_t, _ in kg_sampler: iter += 1 model.train() h_th = th.LongTensor(h) r_th = th.LongTensor(r) pos_t_th = th.LongTensor(pos_t) neg_t_th = th.LongTensor(neg_t) if use_cuda: h_th, r_th, pos_t_th, neg_t_th = h_th.cuda(), r_th.cuda( ), pos_t_th.cuda(), neg_t_th.cuda() loss = model.transR(h_th, r_th, pos_t_th, neg_t_th) loss.backward() optimizer.step() optimizer.zero_grad() total_loss += loss.item() if (iter % args.print_every) == 0 or iter == 1: logging.info("Epoch {:04d} Iter {:04d} | Loss {:.4f} ".format( epoch, iter, total_loss / iter)) logging.info('Time for KGE: {:.1f}s, loss {:.4f}'.format( time() - time1, total_loss / iter)) ### train GNN if args.use_attention: time1 = time() print("Compute attention weight in train ...") with th.no_grad(): A_w = model.compute_attention(train_g) train_g.edata['w'] = A_w print("Time: {:.2f}s".format(time() - time1)) time1 = time() cf_sampler = dataset.CF_pair_sampler(batch_size=args.batch_size) iter = 0 total_loss = 0.0 for user_ids, item_pos_ids, item_neg_ids, _ in cf_sampler: iter += 1 model.train() user_ids_th = th.LongTensor(user_ids) item_pos_ids_th = th.LongTensor(item_pos_ids) item_neg_ids_th = th.LongTensor(item_neg_ids) if use_cuda: user_ids_th, item_pos_ids_th, item_neg_ids_th = \ user_ids_th.cuda(), item_pos_ids_th.cuda(), item_neg_ids_th.cuda() embedding = model.gnn(train_g, train_g.ndata['id']) loss = model.get_loss(embedding, user_ids_th, item_pos_ids_th, item_neg_ids_th) loss.backward() # th.nn.utils.clip_grad_norm_(model.parameters(), args.grad_norm) # clip gradients optimizer.step() optimizer.zero_grad() total_loss += loss.item() if (iter % 
args.print_every) == 0 or iter == 1: logging.info("Epoch {:04d} Iter {:04d} | Loss {:.4f} ".format( epoch, iter, total_loss / iter)) logging.info('Time for GNN: {:.1f}s, loss {:.4f}'.format( time() - time1, total_loss / iter)) if epoch % args.evaluate_every == 0: time1 = time() val_recall, val_ndcg = eval(model, train_g, dataset.train_user_dict, dataset.valid_user_dict, item_id_range, use_cuda, args.use_attention) info = "Epoch{}, [{:.1f}s] val recall:{:.5f}, val ndcg:{:.5f}".format( epoch, time() - time1, val_recall, val_ndcg) # save best model if val_recall > best_recall: valid_metric_logger.log(epoch=epoch, recall=val_recall, ndcg=val_ndcg, is_best=1) best_recall = val_recall #best_ndcg = val_ndcg best_epoch = epoch time1 = time() test_recall, test_ndcg = eval(model, test_g, dataset.train_valid_user_dict, dataset.test_user_dict, item_id_range, use_cuda, args.use_attention) test_metric_logger.log(epoch=epoch, recall=test_recall, ndcg=test_ndcg) info += "\t[{:.1f}s] test recall:{:.5f}, test ndcg:{:.5f}".format( time() - time1, test_recall, test_ndcg) #th.save({'state_dict': model.state_dict(), 'epoch': epoch}, model_state_file) else: valid_metric_logger.log(epoch=epoch, recall=val_recall, ndcg=val_ndcg, is_best=0) recall, ndcg = eval(model, test_g, dataset.train_valid_user_dict, dataset.test_user_dict, item_id_range, use_cuda, args.use_attention) print("test recall:{}, test_ndcg: {}".format(recall, ndcg)) logging.info(info) logging.info( "Final test recall:{:.5f}, test ndcg:{:.5f}, best epoch:{}".format( test_recall, test_ndcg, best_epoch))
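# The KGAT loop above reports recall and NDCG over ranked item lists. The functions below are
# a hedged sketch of per-user recall@K and binary-relevance NDCG@K; ranked_items and
# ground_truth are illustrative names, not the project's API.
import numpy as np

def recall_at_k(ranked_items, ground_truth, k):
    """Fraction of the user's held-out items that appear in the top-k ranking."""
    if not ground_truth:
        return 0.0
    hits = len(set(ranked_items[:k]) & set(ground_truth))
    return hits / len(ground_truth)

def ndcg_at_k(ranked_items, ground_truth, k):
    """DCG of the top-k ranking divided by the ideal DCG for this user."""
    gt = set(ground_truth)
    dcg = sum(1.0 / np.log2(i + 2) for i, item in enumerate(ranked_items[:k]) if item in gt)
    idcg = sum(1.0 / np.log2(i + 2) for i in range(min(len(gt), k)))
    return dcg / idcg if idcg > 0 else 0.0

# Illustrative usage:
# recall_at_k([3, 7, 1, 9], ground_truth=[1, 4], k=3)   # -> 0.5
# ndcg_at_k([3, 7, 1, 9], ground_truth=[1, 4], k=3)     # -> ~0.31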