def evaluate(self) -> None:
    """Run the final evaluation on both the dev and the eval split.

    Builds per-file datasets/loaders (batch_size=1, no shuffling, no
    augmentation), evaluates the model on every file of each split, writes
    losses/metrics to result files, logs parameters, and prints the overall
    segment-based ER and F scores (raw and post-processed) for the eval set.
    """
    # create datasets and loaders; the eval split lives under data/eval,
    # the dev split uses the dataset's default data path
    dev_dataset = BaseDataset(self.feature_type, self.scene,
                              self.hyper_params, self.fft_params)
    eval_dataset = BaseDataset(self.feature_type, self.scene,
                               self.hyper_params, self.fft_params,
                               data_path=os.path.join('data', 'eval'))
    # overlap_factor=1 and rnd_augment=False: deterministic, non-overlapping
    # excerpts so every part of each file is evaluated exactly once
    dev_set = ExcerptDataset(dev_dataset, self.feature_type, self.classes,
                             self.hyper_params['excerpt_size'],
                             self.fft_params, overlap_factor=1,
                             rnd_augment=False)
    eval_set = ExcerptDataset(eval_dataset, self.feature_type, self.classes,
                              self.hyper_params['excerpt_size'],
                              self.fft_params, overlap_factor=1,
                              rnd_augment=False)
    # batch_size=1 so results can be attributed to individual files
    dev_loader = DataLoader(dev_set, batch_size=1, shuffle=False,
                            num_workers=0)
    eval_loader = DataLoader(eval_set, batch_size=1, shuffle=False,
                             num_workers=0)
    # evaluate on all individual files for both dev and eval set
    eval_loss, metrics_eval, metrics_pp_eval = self.evaluate_model_on_files(
        eval_loader)
    dev_loss, metrics_dev, metrics_pp_dev = self.evaluate_model_on_files(
        dev_loader)
    # write results to files and log parameters
    self.write_losses(dev_loss, eval_loss)
    self.write_metrics(metrics_dev, metrics_eval, metrics_pp_dev,
                       metrics_pp_eval)
    filtered_results = self.log_params(metrics_eval, metrics_pp_eval)
    # print headline eval-set numbers (ER = error rate, F = F-score);
    # "_pp" denotes the post-processed predictions
    print(
        f'final eval ER: {filtered_results["final_metric/segment_based/overall/ER"]}'
    )
    print(
        f'final eval F: {filtered_results["final_metric/segment_based/overall/F"]}'
    )
    print(
        f'final eval ER (post-processed): {filtered_results["final_metric_pp/segment_based/overall/ER"]}'
    )
    print(
        f'final eval F (post-processed): {filtered_results["final_metric_pp/segment_based/overall/F"]}'
    )
def get_train_dataset(config):
    """Build the training dataset from the parsed behaviors/news TSV files.

    Args:
        config: configuration object; reads ``config.train_dir`` (directory
            containing ``behaviors_parsed.tsv`` and ``news_parsed.tsv``) and
            ``config.dataset_attributes``.

    Returns:
        The constructed ``BaseDataset`` for training.
    """
    # os.path.join instead of manual '/' concatenation: portable and robust
    # to a trailing separator in config.train_dir
    dataset = BaseDataset(os.path.join(config.train_dir, 'behaviors_parsed.tsv'),
                          os.path.join(config.train_dir, 'news_parsed.tsv'),
                          config.dataset_attributes)
    print(f"Load training dataset with size {len(dataset)}.")
    return dataset
def __init__(self, cfg):
    """Set up the single-generator segmentation trainer from a config object.

    Builds the U-Net, train/val dataloaders, weighted cross-entropy loss,
    Adam optimizer with a polynomial LR schedule, output directories, logger
    and running metrics; optionally resumes model/optimizer state from a
    checkpoint, then moves model and loss to the GPU.
    """
    self.cfg = cfg
    # plain U-Net with BatchNorm, no side output ('no')
    self.Image_generator = U_Net(in_ch=3, out_ch=cfg.DATASET.N_CLASS,
                                 norm=torch.nn.BatchNorm2d, side='no')
    train_dataset = BaseDataset(cfg, split='train')
    valid_dataset = BaseDataset(cfg, split='val')
    self.train_dataloader = data.DataLoader(train_dataset,
                                            batch_size=cfg.DATASET.BATCHSIZE,
                                            num_workers=8, shuffle=True,
                                            drop_last=True)
    # NOTE(review): validation loader also shuffles and drops the last
    # partial batch — presumably intentional here, but worth confirming
    self.valid_dataloader = data.DataLoader(valid_dataset,
                                            batch_size=cfg.DATASET.BATCHSIZE,
                                            num_workers=8, shuffle=True,
                                            drop_last=True)
    # hard-coded per-class weights (9 classes); down-weights classes 1-2,
    # up-weights class 4 — assumes N_CLASS == 9, TODO confirm
    self.criterion = torch.nn.CrossEntropyLoss(
        ignore_index=self.cfg.LOSS.IGNORE_INDEX,
        weight=torch.tensor([1, 0.5, 0.5, 1, 3, 1, 1, 1, 1]).cuda())
    self.ckpt_outdir = os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints')
    if not os.path.isdir(self.ckpt_outdir):
        os.mkdir(self.ckpt_outdir)
    self.val_outdir = os.path.join(cfg.TRAIN.OUTDIR, 'val')
    if not os.path.isdir(self.val_outdir):
        os.mkdir(self.val_outdir)
    # TRAIN.RESUME is an epoch index; values >= 0 trigger checkpoint loading
    self.start_epoch = cfg.TRAIN.RESUME
    self.n_epoch = cfg.TRAIN.N_EPOCH
    self.optimizer = torch.optim.Adam(
        [{'params': self.Image_generator.parameters()}],
        lr=cfg.OPTIMIZER.G_LR,
        betas=(cfg.OPTIMIZER.BETA1, cfg.OPTIMIZER.BETA2),
        weight_decay=cfg.OPTIMIZER.WEIGHT_DECAY)
    # polynomial LR decay (power 0.9) over the total number of iterations
    iter_per_epoch = len(train_dataset) // cfg.DATASET.BATCHSIZE
    lambda_poly = lambda iters: pow(
        (1.0 - iters / (cfg.TRAIN.N_EPOCH * iter_per_epoch)), 0.9)
    self.scheduler = torch.optim.lr_scheduler.LambdaLR(self.optimizer,
                                                       lr_lambda=lambda_poly,)
    self.logger = logger(cfg.TRAIN.OUTDIR, name='train')
    self.running_metrics = runningScore(n_classes=cfg.DATASET.N_CLASS)
    if self.start_epoch >= 0:
        # resume: the checkpoint file is loaded twice (once per state dict)
        self.Image_generator.load_state_dict(
            torch.load(os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints',
                                    '{}epoch.pth'.format(self.start_epoch)))['model'])
        self.optimizer.load_state_dict(
            torch.load(os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints',
                                    '{}epoch.pth'.format(self.start_epoch)))['optimizer'])
        log = "Using the {}th checkpoint".format(self.start_epoch)
        self.logger.info(log)
    self.Image_generator = self.Image_generator.cuda()
    self.criterion = self.criterion.cuda()
def train():
    """Train the news-recommendation model selected by ``model_name``.

    Loads pretrained embeddings when available, optionally resumes from the
    latest checkpoint, then runs ``config.num_batches`` minibatches with
    TensorBoard logging, periodic validation and early stopping on -AUC.
    """
    # one TensorBoard run per launch, tagged with timestamp and optional
    # REMARK environment variable
    writer = SummaryWriter(
        log_dir=
        f"../runs/{model_name}/{datetime.datetime.now().replace(microsecond=0).isoformat()}{'-' + os.environ['REMARK'] if 'REMARK' in os.environ else ''}"
    )
    if not os.path.exists('checkpoint'):
        os.makedirs('checkpoint')
    # embeddings are optional: a missing .npy file silently falls back to
    # randomly initialised embeddings inside the model
    try:
        pretrained_word_embedding = torch.from_numpy(
            np.load('../data/train/pretrained_word_embedding.npy')).float()
    except FileNotFoundError:
        pretrained_word_embedding = None
    if model_name == 'DKN':
        # DKN additionally needs entity and context embeddings
        try:
            pretrained_entity_embedding = torch.from_numpy(
                np.load(
                    '../data/train/pretrained_entity_embedding.npy')).float()
        except FileNotFoundError:
            pretrained_entity_embedding = None
        try:
            pretrained_context_embedding = torch.from_numpy(
                np.load(
                    '../data/train/pretrained_context_embedding.npy')).float()
        except FileNotFoundError:
            pretrained_context_embedding = None
        model = Model(config, pretrained_word_embedding,
                      pretrained_entity_embedding,
                      pretrained_context_embedding, writer).to(device)
    else:
        model = Model(config, pretrained_word_embedding, writer).to(device)
    print(model)
    dataset = BaseDataset('../data/train/behaviors_parsed.tsv',
                          '../data/train/news_parsed.tsv',
                          config.dataset_attributes)
    print(f"Load training dataset with size {len(dataset)}.")
    # wrapped in iter() so minibatches are pulled manually with next();
    # the loader is re-created whenever the dataset is exhausted
    dataloader = iter(
        DataLoader(dataset,
                   batch_size=config.batch_size,
                   shuffle=True,
                   num_workers=config.num_workers,
                   drop_last=True))
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    start_time = time.time()
    loss_full = []          # every per-batch loss, for the running average
    exhaustion_count = 0    # how many times the dataset has been reused
    step = 0                # global step (survives checkpoint resume)
    early_stopping = EarlyStopping()
    checkpoint_dir = os.path.join('../checkpoint', model_name)
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
    checkpoint_path = latest_checkpoint(checkpoint_dir)
    if checkpoint_path is not None:
        # resume model/optimizer/step and the early-stopping baseline
        print(f"Load saved parameters in {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        step = checkpoint['step']
        early_stopping(checkpoint['early_stop_value'])
        model.train()
    with tqdm(total=config.num_batches, desc="Training") as pbar:
        for i in range(1, config.num_batches + 1):
            try:
                minibatch = next(dataloader)
            except StopIteration:
                # dataset exhausted: rebuild the loader and keep going
                exhaustion_count += 1
                tqdm.write(
                    f"Training data exhausted for {exhaustion_count} times after {i} batches, reuse the dataset."
                )
                dataloader = iter(
                    DataLoader(dataset,
                               batch_size=config.batch_size,
                               shuffle=True,
                               num_workers=config.num_workers,
                               drop_last=True))
                minibatch = next(dataloader)
            step += 1
            # model-specific forward signatures / auxiliary losses
            if model_name == 'LSTUR':
                y_pred = model(minibatch["user"],
                               minibatch["clicked_news_length"],
                               minibatch["candidate_news"],
                               minibatch["clicked_news"])
            elif model_name == 'HiFiArk':
                y_pred, regularizer_loss = model(minibatch["candidate_news"],
                                                 minibatch["clicked_news"])
            elif model_name == 'TANR':
                y_pred, topic_classification_loss = model(
                    minibatch["candidate_news"], minibatch["clicked_news"])
            else:
                y_pred = model(minibatch["candidate_news"],
                               minibatch["clicked_news"])
            # NLL of the first candidate — assumes the clicked (positive)
            # candidate is at index 0 of each sample; TODO confirm layout
            loss = torch.stack([x[0] for x in -F.log_softmax(y_pred, dim=1)
                                ]).mean()
            if model_name == 'HiFiArk':
                if i % 10 == 0:
                    writer.add_scalar('Train/BaseLoss', loss.item(), step)
                    writer.add_scalar('Train/RegularizerLoss',
                                      regularizer_loss.item(), step)
                    writer.add_scalar('Train/RegularizerBaseRatio',
                                      regularizer_loss.item() / loss.item(),
                                      step)
                loss += config.regularizer_loss_weight * regularizer_loss
            elif model_name == 'TANR':
                if i % 10 == 0:
                    writer.add_scalar('Train/BaseLoss', loss.item(), step)
                    writer.add_scalar('Train/TopicClassificationLoss',
                                      topic_classification_loss.item(), step)
                    writer.add_scalar(
                        'Train/TopicBaseRatio',
                        topic_classification_loss.item() / loss.item(), step)
                loss += config.topic_classification_loss_weight * topic_classification_loss
            loss_full.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if i % 10 == 0:
                writer.add_scalar('Train/Loss', loss.item(), step)
            if i % config.num_batches_show_loss == 0:
                tqdm.write(
                    f"Time {time_since(start_time)}, batches {i}, current loss {loss.item():.4f}, average loss: {np.mean(loss_full):.4f}"
                )
            if i % config.num_batches_validate == 0:
                # periodic validation; early stopping tracks -AUC (lower is
                # better for the EarlyStopping helper)
                val_auc, val_mrr, val_ndcg5, val_ndcg10 = evaluate(
                    model, '../data/val')
                writer.add_scalar('Validation/AUC', val_auc, step)
                writer.add_scalar('Validation/MRR', val_mrr, step)
                writer.add_scalar('Validation/nDCG@5', val_ndcg5, step)
                writer.add_scalar('Validation/nDCG@10', val_ndcg10, step)
                tqdm.write(
                    f"Time {time_since(start_time)}, batches {i}, validation AUC: {val_auc:.4f}, validation MRR: {val_mrr:.4f}, validation nDCG@5: {val_ndcg5:.4f}, validation nDCG@10: {val_ndcg10:.4f}, "
                )
                early_stop, get_better = early_stopping(-val_auc)
                if early_stop:
                    tqdm.write('Early stop.')
                    break
                elif get_better:
                    # checkpoint only when validation AUC improved
                    torch.save(
                        {
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'step': step,
                            'early_stop_value': -val_auc
                        }, f"../checkpoint/{model_name}/ckpt-{step}.pth")
            pbar.update(1)
def extraction(cfg):
    """Extract per-image feature maps (and masks) with a pretrained network.

    Depending on ``cfg.model_type`` this either runs the UniNet pair
    (FeatNet + MaskNet, producing features and soft/binary masks) or a
    single backbone ('maxout-feature' or 'facenet', features only with
    all-ones masks).  Results are optionally saved as a .mat file or as
    PNG images, controlled by ``cfg.save``.

    Returns:
        tuple: (features, masks, labels, labels_vec) where features is
        (N, 64, 512), masks is (N, 3, 64, 512), labels/img_names are
        per-image lists and labels_vec is (N, class_num).
    """
    # cpu or gpu?
    if torch.cuda.is_available() and cfg.device is not None:
        device = torch.device(cfg.device)
    else:
        if not torch.cuda.is_available():
            print("hey man, buy a GPU!")
        device = torch.device("cpu")
    dataset = BaseDataset(path=cfg.dataset_path,
                          dataset=cfg.dataset,
                          mode=cfg.mode)
    data_loader = DataLoader(dataset,
                             cfg.batch_size,
                             shuffle=False,
                             num_workers=cfg.num_workers)
    if cfg.model_type == 'UniNet':
        featnet = FeatNet()
        featnet.load_state_dict(
            torch.load(cfg.featnet_path, map_location=device))
        featnet.to(device)
        masknet = MaskNet()
        masknet.load_state_dict(
            torch.load(cfg.masknet_path, map_location=device))
        masknet.to(device)
        with torch.no_grad():
            featnet.eval()
            masknet.eval()
            labels = []
            img_names = []
            labels_vec = np.zeros((len(dataset), dataset.class_num))
            features = np.zeros((len(dataset), 64, 512))
            masks = np.zeros((len(dataset), 3, 64, 512))
            # BUG FIX: the original advanced a per-batch counter by 1 and
            # added it to the in-batch index, so rows collided whenever
            # batch_size > 1.  Track a running row offset instead and
            # advance it by the actual batch size.
            offset = 0
            for img_batch, label_batch, label_vec_batch, img_name_batch in tqdm(
                    data_loader, ncols=80, ascii=True):
                img_batch = img_batch.to(device)
                feature_batch = featnet(img_batch)
                mask_batch = masknet(img_batch)
                for idx in range(feature_batch.shape[0]):
                    labels_vec[offset + idx, :] = label_vec_batch[idx, :].numpy()
                    labels.append(label_batch[idx])
                    img_names.append(img_name_batch[idx])
                    features[offset + idx, :, :] = feature_batch[idx].cpu().numpy()
                    # channels 0-1: raw mask logits; channel 2: binarised
                    # mask via softmax argmax (class 1 wins)
                    masks[offset + idx, :2, :, :] = mask_batch[idx].cpu().numpy()
                    mask = F.softmax(mask_batch[idx], dim=0).cpu().numpy()
                    masks[offset + idx, 2, :, :] = mask[0] < mask[1]
                offset += feature_batch.shape[0]
    else:
        if cfg.model_type == 'maxout-feature':
            model = Maxout_feature()
        elif cfg.model_type == 'facenet':
            model = FaceModel(256)
        model.load_state_dict(torch.load(cfg.model_path, map_location=device))
        model.to(device)
        with torch.no_grad():
            model.eval()
            labels = []
            img_names = []
            labels_vec = np.zeros((len(dataset), dataset.class_num))
            features = np.zeros((len(dataset), 64, 512))
            # no mask network here: masks stay all-ones
            masks = np.ones((len(dataset), 3, 64, 512))
            offset = 0  # running row offset (same fix as above)
            for img_batch, label_batch, label_vec_batch, img_name_batch in tqdm(
                    data_loader, ncols=80, ascii=True):
                img_batch = img_batch.to(device)
                feature_batch = model(img_batch)
                for idx in range(feature_batch.shape[0]):
                    labels_vec[offset + idx, :] = label_vec_batch[idx, :].numpy()
                    labels.append(label_batch[idx])
                    img_names.append(img_name_batch[idx])
                    features[offset + idx, :, :] = feature_batch[idx].cpu().numpy()
                offset += feature_batch.shape[0]
    if cfg.save == 'mat':
        ft_path = 'feature/{}__{}.mat'.format(cfg.model, cfg.dataset)
        ft_load = {
            'features': features,
            'masks': masks,
            'labels_vec': labels_vec,
            'labels': labels
        }
        savemat(ft_path, ft_load)
    elif cfg.save == 'pic':
        if not os.path.exists('feature/{}__{}'.format(cfg.model, cfg.dataset)):
            os.makedirs('feature/{}__{}'.format(cfg.model, cfg.dataset))
        for idx in range(len(dataset)):
            # min-max normalise each feature map before writing it as an
            # 8-bit grayscale PNG
            feature_img = features[idx, :, :]
            feature_img = (feature_img - feature_img.min()) / (
                feature_img.max() - feature_img.min())
            Image.fromarray(feature_img * 255).convert('L').save(
                'feature/{}__{}/{}_feature.png'.format(cfg.model, cfg.dataset,
                                                       img_names[idx]))
            Image.fromarray(masks[idx, 2, :, :] * 255).convert('L').save(
                'feature/{}__{}/{}_mask.png'.format(cfg.model, cfg.dataset,
                                                    img_names[idx]))
    return features, masks, labels, labels_vec
def main(eval_mode: bool, feature_type: str, scene: str, hyper_params: dict,
         network_config: dict, eval_settings: dict, fft_params: dict) -> None:
    """
    Main function that takes hyper-parameters, creates the architecture,
    trains the model and evaluates it.

    :param eval_mode: if True, train on the full dataset and track the best
        training loss instead of validating on a held-out fold
    :param feature_type: input representation ('spec', 'mfcc' or 'mels')
    :param scene: acoustic scene name (selects the class list)
    :param hyper_params: training hyper-parameters (learning rate, batch
        size, excerpt size, number of updates, augmentation flags, ...)
    :param network_config: architecture kwargs for SimpleCNN (completed here)
    :param eval_settings: logging/validation intervals
    :param fft_params: STFT/feature-extraction parameters
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    os.makedirs('results', exist_ok=True)
    experiment_id = datetime.now().strftime(
        "%Y%m%d-%H%M%S") + f' - {feature_type} - {scene}'
    writer = SummaryWriter(log_dir=os.path.join('tensorboard', experiment_id))
    shutil.copyfile('config.json', os.path.join(
        'results', 'config.json'))  # save current config file to results
    training_dataset = BaseDataset(feature_type, scene, hyper_params,
                                   fft_params)
    # create network
    classes = util.get_scene_classes(scene)
    plotter = Plotter(classes, hop_size=fft_params['hop_size'],
                      sampling_rate=22050)
    # finalize network config parameters: output size = number of classes,
    # input feature dimension depends on the chosen representation
    network_config['out_features'] = len(classes)
    if feature_type == 'spec':
        network_config['n_features'] = fft_params['n_fft'] // 2 + 1
    elif feature_type == 'mfcc':
        network_config['n_features'] = fft_params['n_mfcc']
    elif feature_type == 'mels':
        network_config['n_features'] = fft_params['n_mels']
    # create network
    net = SimpleCNN(**network_config)
    # Save initial model as "best" model (will be overwritten later)
    model_path = os.path.join('results',
                              f'best_{feature_type}_{scene}_model.pt')
    if not os.path.exists(model_path):
        torch.save(net, model_path)
    else:  # if there already exists a model, just load parameters
        print(f'reusing pre-trained model: "{model_path}"')
        net = torch.load(model_path, map_location=torch.device('cpu'))
    net.to(device)
    # get loss function (multi-label targets -> per-class BCE)
    loss_fn = torch.nn.BCELoss()
    # create adam optimizer
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=hyper_params['learning_rate'],
                                 weight_decay=hyper_params['weight_decay'])
    train_stats_at = eval_settings['train_stats_at']
    validate_at = eval_settings['validate_at']
    best_loss = np.inf  # best validation loss so far
    progress_bar = tqdm.tqdm(total=hyper_params['n_updates'],
                             desc=f"loss: {np.nan:7.5f}", position=0)
    update = 0  # current update counter
    fold_idx = 1  # one random fold (defines split into training and validation set)
    rnd_augment = hyper_params['rnd_augment']
    # create subsets and data loaders; in eval_mode the whole dataset is
    # used for training and no validation loader exists
    if eval_mode:
        train_subset = training_dataset
        val_loader = None
    else:
        train_subset = Subset(training_dataset,
                              training_dataset.get_fold_indices(fold_idx)[0])
        val_subset = Subset(training_dataset,
                            training_dataset.get_fold_indices(fold_idx)[1])
        val_set = ExcerptDataset(val_subset, feature_type, classes,
                                 hyper_params['excerpt_size'], fft_params,
                                 overlap_factor=1, rnd_augment=False)
        val_loader = DataLoader(val_set,
                                batch_size=hyper_params['batch_size'],
                                shuffle=False, num_workers=0)
    train_set = ExcerptDataset(
        train_subset, feature_type, classes, hyper_params['excerpt_size'],
        fft_params, overlap_factor=hyper_params['train_overlap_factor'],
        rnd_augment=rnd_augment)
    train_loader = DataLoader(train_set,
                              batch_size=hyper_params['batch_size'],
                              shuffle=True, num_workers=0)
    n_updates = hyper_params['n_updates']
    # main training loop
    while update <= n_updates:
        if rnd_augment and update > 0:
            # regenerate new excerpts (in background) but use current ones for training
            train_set.generate_excerpts()
        for data in train_loader:
            inputs, targets, audio_file, idx = data
            inputs = inputs.to(device, dtype=torch.float32)
            targets = targets.to(device, dtype=torch.float32)
            optimizer.zero_grad()
            predictions = net(inputs)
            loss = loss_fn(predictions, targets)
            loss.backward()
            optimizer.step()
            if update % train_stats_at == 0 and update > 0:
                # log training loss
                writer.add_scalar(tag="training/loss",
                                  scalar_value=loss.cpu(),
                                  global_step=update)
            if not eval_mode and update % validate_at == 0 and update > 0:
                # evaluate model on validation set, log parameters and metrics
                val_loss, metrics, metrics_pp = validate_model(
                    net, val_loader, classes, update, device, plotter)
                print(f'val_loss: {val_loss}')
                f_score = metrics['segment_based']['overall']['F']
                err_rate = metrics['segment_based']['overall']['ER']
                f_score_pp = metrics_pp['segment_based']['overall']['F']
                err_rate_pp = metrics_pp['segment_based']['overall']['ER']
                print(f'f_score: {f_score}')
                print(f'err_rate: {err_rate}')
                print(f'f_score_pp: {f_score_pp}')
                print(f'err_rate_pp: {err_rate_pp}')
                params = net.parameters()
                log_validation_params(writer, val_loss, params, metrics,
                                      metrics_pp, update)
                # Save best model for early stopping
                if val_loss < best_loss:
                    print(
                        f'{val_loss} < {best_loss}... saving as new {os.path.split(model_path)[-1]}'
                    )
                    best_loss = val_loss
                    torch.save(net, model_path)
            if eval_mode:
                # in eval mode, just compare train_loss
                train_loss = loss.cpu()
                if train_loss < best_loss:
                    print(
                        f'{train_loss} < {best_loss}... saving as new {os.path.split(model_path)[-1]}'
                    )
                    best_loss = train_loss
                    torch.save(net, model_path)
            # update progress and update-counter
            progress_bar.set_description(f"loss: {loss:7.5f}", refresh=True)
            progress_bar.update()
            update += 1
            if update >= n_updates:
                break
    progress_bar.close()
    print('finished training.')
    print('starting evaluation...')
    # final evaluation always uses the best saved model via the Evaluator
    evaluator = evaluation.Evaluator(feature_type, scene, hyper_params,
                                     network_config, fft_params, model_path,
                                     device, writer, plotter)
    evaluator.evaluate()
    print('zipping "results" folder...')
    util.zip_folder('results', f'results_{feature_type}_{scene}')
def __init__(self, cfg):
    """Set up the adversarial label-refinement trainer from a config object.

    Builds two U-Net generators (one refining old labels, one translating
    images), a patch discriminator, generator/discriminator losses, the
    train/val dataloaders, Adam optimizers with polynomial LR schedules,
    output directories, logger and metrics; optionally resumes all five
    state dicts from a checkpoint, then moves everything to the GPU.
    """
    self.cfg = cfg
    # generator refining the old (noisy) labels; side output on the decoder
    self.OldLabel_generator = U_Net(in_ch=cfg.DATASET.N_CLASS,
                                    out_ch=cfg.DATASET.N_CLASS,
                                    side='out')
    # generator predicting labels from the RGB image; side input
    self.Image_generator = U_Net(in_ch=3,
                                 out_ch=cfg.DATASET.N_CLASS,
                                 side='in')
    # patch discriminator over concatenated (label, image) channels
    self.discriminator = Discriminator(cfg.DATASET.N_CLASS + 3,
                                       cfg.DATASET.IMGSIZE,
                                       patch=True)
    self.criterion_G = GeneratorLoss(cfg.LOSS.LOSS_WEIGHT[0],
                                     cfg.LOSS.LOSS_WEIGHT[1],
                                     cfg.LOSS.LOSS_WEIGHT[2],
                                     ignore_index=cfg.LOSS.IGNORE_INDEX)
    self.criterion_D = DiscriminatorLoss()
    train_dataset = BaseDataset(cfg, split='train')
    valid_dataset = BaseDataset(cfg, split='val')
    self.train_dataloader = data.DataLoader(
        train_dataset,
        batch_size=cfg.DATASET.BATCHSIZE,
        num_workers=8,
        shuffle=True,
        drop_last=True)
    # NOTE(review): validation loader also shuffles/drops the last batch —
    # mirrors the sibling trainer; confirm it is intentional
    self.valid_dataloader = data.DataLoader(
        valid_dataset,
        batch_size=cfg.DATASET.BATCHSIZE,
        num_workers=8,
        shuffle=True,
        drop_last=True)
    self.ckpt_outdir = os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints')
    if not os.path.isdir(self.ckpt_outdir):
        os.mkdir(self.ckpt_outdir)
    self.val_outdir = os.path.join(cfg.TRAIN.OUTDIR, 'val')
    if not os.path.isdir(self.val_outdir):
        os.mkdir(self.val_outdir)
    # TRAIN.RESUME is an epoch index; values >= 0 trigger checkpoint loading
    self.start_epoch = cfg.TRAIN.RESUME
    self.n_epoch = cfg.TRAIN.N_EPOCH
    # one optimizer over both generators, one for the discriminator
    self.optimizer_G = torch.optim.Adam(
        [{
            'params': self.OldLabel_generator.parameters()
        }, {
            'params': self.Image_generator.parameters()
        }],
        lr=cfg.OPTIMIZER.G_LR,
        betas=(cfg.OPTIMIZER.BETA1, cfg.OPTIMIZER.BETA2),
        weight_decay=cfg.OPTIMIZER.WEIGHT_DECAY)
    self.optimizer_D = torch.optim.Adam(
        [{
            'params': self.discriminator.parameters(),
            'initial_lr': cfg.OPTIMIZER.D_LR
        }],
        lr=cfg.OPTIMIZER.D_LR,
        betas=(cfg.OPTIMIZER.BETA1, cfg.OPTIMIZER.BETA2),
        weight_decay=cfg.OPTIMIZER.WEIGHT_DECAY)
    # polynomial LR decay (power 0.9) over the total number of iterations,
    # shared by both schedulers
    iter_per_epoch = len(train_dataset) // cfg.DATASET.BATCHSIZE
    lambda_poly = lambda iters: pow(
        (1.0 - iters / (cfg.TRAIN.N_EPOCH * iter_per_epoch)), 0.9)
    self.scheduler_G = torch.optim.lr_scheduler.LambdaLR(
        self.optimizer_G,
        lr_lambda=lambda_poly,
    )  # last_epoch=(self.start_epoch+1)*iter_per_epoch)
    self.scheduler_D = torch.optim.lr_scheduler.LambdaLR(
        self.optimizer_D,
        lr_lambda=lambda_poly,
    )  # last_epoch=(self.start_epoch+1)*iter_per_epoch)
    self.logger = logger(cfg.TRAIN.OUTDIR, name='train')
    self.running_metrics = runningScore(n_classes=cfg.DATASET.N_CLASS)
    if self.start_epoch >= 0:
        # resume: the same checkpoint file is re-loaded for each state dict
        self.OldLabel_generator.load_state_dict(
            torch.load(
                os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints',
                             '{}epoch.pth'.format(
                                 self.start_epoch)))['model_G_N'])
        self.Image_generator.load_state_dict(
            torch.load(
                os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints',
                             '{}epoch.pth'.format(
                                 self.start_epoch)))['model_G_I'])
        self.discriminator.load_state_dict(
            torch.load(
                os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints',
                             '{}epoch.pth'.format(
                                 self.start_epoch)))['model_D'])
        self.optimizer_G.load_state_dict(
            torch.load(
                os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints',
                             '{}epoch.pth'.format(
                                 self.start_epoch)))['optimizer_G'])
        self.optimizer_D.load_state_dict(
            torch.load(
                os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints',
                             '{}epoch.pth'.format(
                                 self.start_epoch)))['optimizer_D'])
        log = "Using the {}th checkpoint".format(self.start_epoch)
        self.logger.info(log)
    self.Image_generator = self.Image_generator.cuda()
    self.OldLabel_generator = self.OldLabel_generator.cuda()
    self.discriminator = self.discriminator.cuda()
    self.criterion_G = self.criterion_G.cuda()
    self.criterion_D = self.criterion_D.cuda()
def train():
    """Train the news-recommendation model on a PySyft-federated dataset.

    Variant of the plain ``train()`` that distributes the training data
    across the ``bob`` and ``alice`` workers with ``sy.FederatedDataLoader``
    and sends/retrieves the model per batch.  Validates periodically and
    checkpoints when validation AUC improves.
    """
    writer = SummaryWriter(
        log_dir=
        f"./runs/{model_name}/{datetime.datetime.now().replace(microsecond=0).isoformat()}{'-' + os.environ['REMARK'] if 'REMARK' in os.environ else ''}"
    )
    if not os.path.exists('checkpoint'):
        os.makedirs('checkpoint')
    # optional pretrained embeddings; missing files fall back to None
    try:
        pretrained_word_embedding = torch.from_numpy(
            np.load('./data/train/pretrained_word_embedding.npy')).float()
    except FileNotFoundError:
        pretrained_word_embedding = None
    if model_name == 'DKN':
        try:
            pretrained_entity_embedding = torch.from_numpy(
                np.load(
                    './data/train/pretrained_entity_embedding.npy')).float()
        except FileNotFoundError:
            pretrained_entity_embedding = None
        try:
            pretrained_context_embedding = torch.from_numpy(
                np.load(
                    './data/train/pretrained_context_embedding.npy')).float()
        except FileNotFoundError:
            pretrained_context_embedding = None
        model = Model(config, pretrained_word_embedding,
                      pretrained_entity_embedding,
                      pretrained_context_embedding).to(device)
    else:
        model = Model(config, pretrained_word_embedding).to(device)
    print(model)
    dataset = BaseDataset('./data/train/behaviors_parsed.tsv',
                          './data/train/news_parsed.tsv',
                          './data/train/roberta')
    print(f"Load training dataset with size {len(dataset)}.")
    # The plain torch DataLoader is replaced by a FederatedDataLoader:
    # dataset.federate((bob, alice)) splits the data across the two
    # PySyft workers, and each minibatch arrives with a .location.
    dataloader = sy.FederatedDataLoader(dataset.federate((bob, alice)),
                                        batch_size=config.batch_size,
                                        shuffle=True,
                                        num_workers=config.num_workers,
                                        drop_last=True,
                                        pin_memory=True)
    print(f"The training dataset has been loaded!")
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=config.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=3,
                                                gamma=0.95,
                                                last_epoch=-1)
    start_time = time.time()
    loss_full = []
    exhaustion_count = 0
    step = 0
    early_stopping = EarlyStopping()
    checkpoint_dir = os.path.join('./checkpoint', model_name)
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
    checkpoint_path = latest_checkpoint(checkpoint_dir)
    # NOTE(review): checkpoint restoring is currently disabled in this
    # federated variant; checkpoint_path is computed but not loaded.
    for i, (minibatch, target) in enumerate(dataloader):
        step += 1
        if model_name == 'LSTUR':
            y_pred = model(minibatch["user"],
                           minibatch["clicked_news_length"],
                           minibatch["candidate_news"],
                           minibatch["clicked_news"])
        elif model_name == 'HiFiArk':
            y_pred, regularizer_loss = model(minibatch["candidate_news"],
                                             minibatch["clicked_news"])
        elif model_name == 'TANR':
            y_pred, topic_classification_loss = model(
                minibatch["candidate_news"], minibatch["clicked_news"])
        else:
            # Send the model to the worker holding this minibatch, then
            # move the tensors to the local device.
            model.send(minibatch.location)
            minibatch, target = minibatch.to(device), target.to(device)
            y_pred = model(minibatch)
        # NOTE(review): `target` is only unpacked from the federated loader;
        # the LSTUR/HiFiArk/TANR branches keep the dict-style minibatch and
        # look unadapted to this loader — confirm they are unused here.
        loss = criterion(y_pred, target)
        if model_name == 'HiFiArk':
            if i % 10 == 0:
                writer.add_scalar('Train/BaseLoss', loss.get(), step)
                writer.add_scalar('Train/RegularizerLoss',
                                  regularizer_loss.get(), step)
                writer.add_scalar('Train/RegularizerBaseRatio',
                                  regularizer_loss.get() / loss.get(), step)
            loss += config.regularizer_loss_weight * regularizer_loss
        elif model_name == 'TANR':
            if i % 10 == 0:
                writer.add_scalar('Train/BaseLoss', loss.item(), step)
                writer.add_scalar('Train/TopicClassificationLoss',
                                  topic_classification_loss.item(), step)
                writer.add_scalar(
                    'Train/TopicBaseRatio',
                    topic_classification_loss.item() / loss.item(), step)
            loss += config.topic_classification_loss_weight * topic_classification_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        # retrieve the updated model and the loss value from the worker
        model.get()
        loss = loss.get().detach().cpu().item()
        loss_full.append(loss)
        if i % 10 == 0:
            writer.add_scalar('Train/Loss', loss, step)
        if i % config.num_batches_show_loss == 0:
            tqdm.write(
                f"Time {time_since(start_time)}, batches {i}, current loss {loss:.4f}, average loss: {np.mean(loss_full):.4f}, latest average loss: {np.mean(loss_full[-256:]):.4f}"
            )
        if i % config.num_batches_validate == 0:
            (model if model_name != 'Exp1' else models[0]).eval()
            val_auc, val_mrr, val_ndcg5, val_ndcg10 = evaluate(
                model if model_name != 'Exp1' else models[0], './data/val',
                200000)
            (model if model_name != 'Exp1' else models[0]).train()
            writer.add_scalar('Validation/AUC', val_auc, step)
            writer.add_scalar('Validation/MRR', val_mrr, step)
            writer.add_scalar('Validation/nDCG@5', val_ndcg5, step)
            writer.add_scalar('Validation/nDCG@10', val_ndcg10, step)
            tqdm.write(
                f"Time {time_since(start_time)}, batches {i}, validation AUC: {val_auc:.4f}, validation MRR: {val_mrr:.4f}, validation nDCG@5: {val_ndcg5:.4f}, validation nDCG@10: {val_ndcg10:.4f}, "
            )
            early_stop, get_better = early_stopping(-val_auc)
            if early_stop:
                tqdm.write('Early stop.')
                break
            elif get_better:
                try:
                    torch.save(
                        {
                            'model_state_dict':
                            (model if model_name != 'Exp1' else
                             models[0]).state_dict(),
                            # BUG FIX: was `optimizefrs[0]` (undefined name,
                            # NameError on the Exp1 path) — the ensemble
                            # optimizer list is named `optimizers` in the
                            # sibling non-federated trainer.  NOTE(review):
                            # `models`/`optimizers` are still never defined
                            # in this function; the Exp1 path needs wiring.
                            'optimizer_state_dict':
                            (optimizer if model_name != 'Exp1' else
                             optimizers[0]).state_dict(),
                            'step':
                            step,
                            'early_stop_value':
                            -val_auc
                        }, f"./checkpoint/{model_name}/ckpt-{step}.pth")
                except OSError as error:
                    print(f"OS error: {error}")
def main(gpu, cfg, args):
    """Per-process entry point for multi-GPU DDP segmentation training.

    Initialises the NCCL process group for this rank, builds the
    encoder/decoder segmentation module, distributed samplers/loaders for
    the train and val splits, optionally resumes model and optimizer state,
    wraps the model in DistributedDataParallel and runs the epoch loop with
    periodic checkpointing (rank 0 only) and optional validation.

    :param gpu: local rank of this process (0-based)
    :param cfg: model/training configuration
    :param args: command-line arguments (gpu offset, port, batch size, ...)
    """
    # Network Builders
    load_gpu = gpu + args.start_gpu  # physical GPU id = local rank + offset
    rank = gpu
    torch.cuda.set_device(load_gpu)
    dist.init_process_group(
        backend='nccl',
        init_method='tcp://127.0.0.1:{}'.format(args.port),
        world_size=args.gpu_num,
        rank=rank,
        timeout=datetime.timedelta(seconds=300))
    # optional mixed-precision training
    if args.use_float16:
        from torch.cuda.amp import autocast as autocast, GradScaler
        scaler = GradScaler()
    else:
        scaler = None
        autocast = None
    label_num_ = args.num_class
    net_encoder = ModelBuilder.build_encoder(
        arch=cfg.MODEL.arch_encoder.lower(),
        fc_dim=cfg.MODEL.fc_dim,
        weights=cfg.MODEL.weights_encoder)
    net_decoder = ModelBuilder.build_decoder(
        arch=cfg.MODEL.arch_decoder.lower(),
        fc_dim=cfg.MODEL.fc_dim,
        num_class=label_num_,
        weights=cfg.MODEL.weights_decoder)
    # 255 is the ignore label in the ground-truth masks
    crit = nn.NLLLoss(ignore_index=255)
    # deep-supervision decoders take an extra loss-scale argument
    if cfg.MODEL.arch_decoder.endswith('deepsup'):
        segmentation_module = SegmentationModule(
            net_encoder, net_decoder, crit, cfg.TRAIN.deep_sup_scale)
    else:
        segmentation_module = SegmentationModule(
            net_encoder, net_decoder, crit)
    if args.use_clipdataset:
        dataset_train = BaseDataset_longclip(args, 'train')
    else:
        dataset_train = BaseDataset(args, 'train')
    # shuffling is delegated to the DistributedSampler, hence shuffle=False
    sampler_train = torch.utils.data.distributed.DistributedSampler(
        dataset_train)
    loader_train = torch.utils.data.DataLoader(
        dataset_train, batch_size=args.batchsize, shuffle=False,
        sampler=sampler_train, pin_memory=True, num_workers=args.workers)
    print('1 Epoch = {} iters'.format(cfg.TRAIN.epoch_iters))
    dataset_val = BaseDataset(args, 'val')
    sampler_val = torch.utils.data.distributed.DistributedSampler(dataset_val)
    loader_val = torch.utils.data.DataLoader(
        dataset_val, batch_size=args.batchsize, shuffle=False,
        sampler=sampler_val, pin_memory=True, num_workers=args.workers)
    # load nets into gpu, then convert BatchNorm to SyncBatchNorm for DDP
    segmentation_module = segmentation_module.cuda(load_gpu)
    segmentation_module = nn.SyncBatchNorm.convert_sync_batchnorm(
        segmentation_module)
    if args.resume_epoch != 0:
        to_load = torch.load(
            os.path.join('./resume',
                         'model_epoch_{}.pth'.format(args.resume_epoch)),
            map_location=torch.device("cuda:" + str(load_gpu)))
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in to_load.items():
            # strip the leading 'module.' prefix (7 chars) that a
            # DataParallel/DDP-saved state dict carries on every key
            name = k[7:]
            new_state_dict[name] = v
        cfg.TRAIN.start_epoch = args.resume_epoch
        segmentation_module.load_state_dict(new_state_dict)
    segmentation_module = torch.nn.parallel.DistributedDataParallel(
        segmentation_module,
        device_ids=[load_gpu],
        find_unused_parameters=True)
    # Set up optimizers
    nets = segmentation_module
    optimizers = create_optimizers(segmentation_module, cfg)
    if args.resume_epoch != 0:
        optimizers.load_state_dict(
            torch.load(
                os.path.join('./resume',
                             'opt_epoch_{}.pth'.format(args.resume_epoch)),
                map_location=torch.device("cuda:" + str(load_gpu))))
        print('resume from epoch {}'.format(args.resume_epoch))
    # Main loop
    history = {'train': {'epoch': [], 'loss': [], 'acc': []}}
    for epoch in range(cfg.TRAIN.start_epoch, cfg.TRAIN.num_epoch):
        # save an initial checkpoint once, from rank 0 only
        if dist.get_rank() == 0 and epoch == 0:
            checkpoint(nets, optimizers, history, args, epoch + 1)
        print('Epoch {}'.format(epoch))
        train(segmentation_module, loader_train, optimizers, history,
              epoch + 1, cfg, args, load_gpu, scaler=scaler,
              autocast=autocast)
        ###################
        # checkpointing every 10 epochs, rank 0 only
        if dist.get_rank() == 0 and (epoch + 1) % 10 == 0:
            checkpoint(segmentation_module, optimizers, history, args,
                       epoch + 1)
            # NOTE(review): validation runs only on the rank-0 / every-10th
            # -epoch path here — confirm that is the intended cadence
            if args.validation:
                test(segmentation_module, loader_val, args)
    print('Training Done!')
def train():
    """Train a news-recommendation model selected by the global `model_name`.

    Handles several model families: DKN (extra entity/context embeddings),
    Exp1 (an ensemble of models averaged in probability space), Exp2
    (no pretrained embeddings), and the default single-model path
    (LSTUR / HiFiArk / TANR / others). Resumes from the latest checkpoint
    when one exists, logs to TensorBoard, validates periodically, and
    saves a checkpoint whenever validation AUC improves.
    """
    writer = SummaryWriter(
        log_dir=
        f"./runs/{model_name}/{datetime.datetime.now().replace(microsecond=0).isoformat()}{'-' + os.environ['REMARK'] if 'REMARK' in os.environ else ''}"
    )
    if not os.path.exists('checkpoint'):
        os.makedirs('checkpoint')
    # Pretrained embeddings are optional: fall back to None (random init).
    try:
        pretrained_word_embedding = torch.from_numpy(
            np.load('./data/train/pretrained_word_embedding.npy')).float()
    except FileNotFoundError:
        pretrained_word_embedding = None
    if model_name == 'DKN':
        # DKN additionally consumes entity and context embeddings.
        try:
            pretrained_entity_embedding = torch.from_numpy(
                np.load(
                    './data/train/pretrained_entity_embedding.npy')).float()
        except FileNotFoundError:
            pretrained_entity_embedding = None
        try:
            pretrained_context_embedding = torch.from_numpy(
                np.load(
                    './data/train/pretrained_context_embedding.npy')).float()
        except FileNotFoundError:
            pretrained_context_embedding = None
        model = Model(config, pretrained_word_embedding,
                      pretrained_entity_embedding,
                      pretrained_context_embedding).to(device)
    elif model_name == 'Exp1':
        # Exp1 trains an ensemble of identical models.
        models = nn.ModuleList([
            Model(config, pretrained_word_embedding).to(device)
            for _ in range(config.ensemble_factor)
        ])
    elif model_name == 'Exp2':
        model = Model(config).to(device)
    else:
        model = Model(config, pretrained_word_embedding).to(device)
    if model_name != 'Exp1':
        print(model)
    else:
        print(models[0])
    dataset = BaseDataset('data/train/behaviors_parsed.tsv',
                          'data/train/news_parsed.tsv', 'data/train/roberta')
    print(f"Load training dataset with size {len(dataset)}.")
    # Manual iterator so exhaustion can be caught and the loader restarted.
    dataloader = iter(
        DataLoader(dataset,
                   batch_size=config.batch_size,
                   shuffle=True,
                   num_workers=config.num_workers,
                   drop_last=True,
                   pin_memory=True))
    if model_name != 'Exp1':
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=config.learning_rate)
    else:
        # Ensemble outputs log-probabilities, hence NLLLoss + one optimizer per model.
        criterion = nn.NLLLoss()
        optimizers = [
            torch.optim.Adam(model.parameters(), lr=config.learning_rate)
            for model in models
        ]
    start_time = time.time()
    loss_full = []
    exhaustion_count = 0
    step = 0
    early_stopping = EarlyStopping()
    checkpoint_dir = os.path.join('./checkpoint', model_name)
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
    checkpoint_path = latest_checkpoint(checkpoint_dir)
    if checkpoint_path is not None:
        # Resume model/optimizer state, step counter, and early-stop baseline.
        print(f"Load saved parameters in {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path)
        early_stopping(checkpoint['early_stop_value'])
        step = checkpoint['step']
        if model_name != 'Exp1':
            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            model.train()
        else:
            # NOTE(review): every ensemble member is restored from the SAME
            # saved state (only models[0] is ever saved below) -- confirm
            # this re-seeding of the whole ensemble is intended.
            for model in models:
                model.load_state_dict(checkpoint['model_state_dict'])
                model.train()
            for optimizer in optimizers:
                optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    for i in tqdm(range(
            1, config.num_epochs * len(dataset) // config.batch_size + 1),
                  desc="Training"):
        try:
            minibatch = next(dataloader)
        except StopIteration:
            # Dataset exhausted: rebuild the loader and keep going.
            exhaustion_count += 1
            tqdm.write(
                f"Training data exhausted for {exhaustion_count} times after {i} batches, reuse the dataset."
            )
            dataloader = iter(
                DataLoader(dataset,
                           batch_size=config.batch_size,
                           shuffle=True,
                           num_workers=config.num_workers,
                           drop_last=True,
                           pin_memory=True))
            minibatch = next(dataloader)
        step += 1
        # Forward pass; signature varies per model family.
        if model_name == 'LSTUR':
            y_pred = model(minibatch["user"], minibatch["clicked_news_length"],
                           minibatch["candidate_news"],
                           minibatch["clicked_news"])
        elif model_name == 'HiFiArk':
            y_pred, regularizer_loss = model(minibatch["candidate_news"],
                                             minibatch["clicked_news"])
        elif model_name == 'TANR':
            y_pred, topic_classification_loss = model(
                minibatch["candidate_news"], minibatch["clicked_news"])
        elif model_name == 'Exp1':
            # Average member predictions in probability space, then take log
            # so the result is valid input for NLLLoss.
            y_preds = [
                model(minibatch["candidate_news"], minibatch["clicked_news"])
                for model in models
            ]
            y_pred_averaged = torch.stack(
                [F.softmax(y_pred, dim=1) for y_pred in y_preds],
                dim=-1).mean(dim=-1)
            y_pred = torch.log(y_pred_averaged)
        else:
            y_pred = model(minibatch["candidate_news"],
                           minibatch["clicked_news"])
        # Target class 0: assumes the positive candidate is listed first
        # in each sample -- TODO confirm against the dataset builder.
        y = torch.zeros(len(y_pred)).long().to(device)
        loss = criterion(y_pred, y)
        if model_name == 'HiFiArk':
            if i % 10 == 0:
                writer.add_scalar('Train/BaseLoss', loss.item(), step)
                writer.add_scalar('Train/RegularizerLoss',
                                  regularizer_loss.item(), step)
                writer.add_scalar('Train/RegularizerBaseRatio',
                                  regularizer_loss.item() / loss.item(), step)
            loss += config.regularizer_loss_weight * regularizer_loss
        elif model_name == 'TANR':
            if i % 10 == 0:
                writer.add_scalar('Train/BaseLoss', loss.item(), step)
                writer.add_scalar('Train/TopicClassificationLoss',
                                  topic_classification_loss.item(), step)
                writer.add_scalar(
                    'Train/TopicBaseRatio',
                    topic_classification_loss.item() / loss.item(), step)
            loss += config.topic_classification_loss_weight * topic_classification_loss
        loss_full.append(loss.item())
        # Backward + step; the ensemble steps every member's optimizer.
        if model_name != 'Exp1':
            optimizer.zero_grad()
        else:
            for optimizer in optimizers:
                optimizer.zero_grad()
        loss.backward()
        if model_name != 'Exp1':
            optimizer.step()
        else:
            for optimizer in optimizers:
                optimizer.step()
        if i % 10 == 0:
            writer.add_scalar('Train/Loss', loss.item(), step)
        if i % config.num_batches_show_loss == 0:
            tqdm.write(
                f"Time {time_since(start_time)}, batches {i}, current loss {loss.item():.4f}, average loss: {np.mean(loss_full):.4f}, latest average loss: {np.mean(loss_full[-256:]):.4f}"
            )
        if i % config.num_batches_validate == 0:
            # Validate (ensemble: only the first member is evaluated/saved).
            (model if model_name != 'Exp1' else models[0]).eval()
            val_auc, val_mrr, val_ndcg5, val_ndcg10 = evaluate(
                model if model_name != 'Exp1' else models[0], './data/val',
                200000)
            (model if model_name != 'Exp1' else models[0]).train()
            writer.add_scalar('Validation/AUC', val_auc, step)
            writer.add_scalar('Validation/MRR', val_mrr, step)
            writer.add_scalar('Validation/nDCG@5', val_ndcg5, step)
            writer.add_scalar('Validation/nDCG@10', val_ndcg10, step)
            tqdm.write(
                f"Time {time_since(start_time)}, batches {i}, validation AUC: {val_auc:.4f}, validation MRR: {val_mrr:.4f}, validation nDCG@5: {val_ndcg5:.4f}, validation nDCG@10: {val_ndcg10:.4f}, "
            )
            # Early stopping tracks -AUC (lower is better for the stopper).
            early_stop, get_better = early_stopping(-val_auc)
            if early_stop:
                tqdm.write('Early stop.')
                break
            elif get_better:
                try:
                    torch.save(
                        {
                            'model_state_dict': (model if model_name != 'Exp1'
                                                 else models[0]).state_dict(),
                            'optimizer_state_dict':
                            (optimizer if model_name != 'Exp1' else
                             optimizers[0]).state_dict(),
                            'step': step,
                            'early_stop_value': -val_auc
                        }, f"./checkpoint/{model_name}/ckpt-{step}.pth")
                except OSError as error:
                    # Best-effort save; training continues on disk errors.
                    print(f"OS error: {error}")
def train(fed_num):
    """Federated training of the recommendation model with PySyft.

    fed_num -- number of VirtualWorkers the training data is federated over.

    Each epoch the current global model is copied to every worker; each
    minibatch is trained only on the worker that owns it. Periodically the
    worker models are moved to a secure worker, their parameters averaged
    into the global model, and fresh copies redistributed.
    """
    VirtualWorker = []
    hook = sy.TorchHook(torch)  # <-- NEW: hook PyTorch ie add extra functionalities to support Federated Learning
    for i in range(fed_num):
        VirtualWorker.append(sy.VirtualWorker(hook, id=str(i)))
    VirtualWorker = tuple(VirtualWorker)
    # Neutral worker used as the meeting point for parameter averaging.
    secure_worker = sy.VirtualWorker(hook, id="secure_worker")
    #bob = sy.VirtualWorker(hook, id="bob")  # <-- NEW: define remote worker bob
    #alice = sy.VirtualWorker(hook, id="alice")  # <-- NEW: and alice
    #celine = sy.VirtualWorker(hook, id="celine")
    #david = sy.VirtualWorker(hook, id="david")
    #elsa = sy.VirtualWorker(hook, id="elsa")
    writer = SummaryWriter(
        log_dir=
        f"./runs/{model_name}/{datetime.datetime.now().replace(microsecond=0).isoformat()}{'-' + os.environ['REMARK'] if 'REMARK' in os.environ else ''}"
    )
    if not os.path.exists('checkpoint'):
        os.makedirs('checkpoint')
    # Pretrained word embeddings are optional.
    try:
        pretrained_word_embedding = torch.from_numpy(
            np.load('./data/train/pretrained_word_embedding.npy')).float()
    except FileNotFoundError:
        pretrained_word_embedding = None
    model = Model(config, pretrained_word_embedding)
    print(model)
    dataset = BaseDataset('./data/train/behaviors_parsed.tsv',
                          './data/train/news_parsed.tsv',
                          './data/train/roberta')
    print(f"Load training dataset with size {len(dataset)}.")
    ###############################################
    ###############################################
    # In the step we need to tranform the dataset in federated manner
    #print(dataset)
    dataloader = sy.FederatedDataLoader(dataset.federate(VirtualWorker),
                                        batch_size=config.batch_size,
                                        shuffle=True,
                                        num_workers=config.num_workers,
                                        drop_last=True,
                                        pin_memory=True
                                        )
    ###############################################
    print(f"The training dataset has been loaded!")
    #optimizer = torch.optim.SGD(model.parameters(),lr=config.learning_rate)
    start_time = time.time()
    loss_full = []
    exhaustion_count = 0
    step = 0
    early_stopping = EarlyStopping()
    checkpoint_dir = os.path.join('./checkpoint', model_name)
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
    checkpoint_path = latest_checkpoint(checkpoint_dir)
    '''
    if checkpoint_path is not None:
        print(f"Load saved parameters in {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path)
        early_stopping(checkpoint['early_stop_value'])
        step = checkpoint['step']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        model.train()
    '''
    #for i in tqdm(range(1,config.num_epochs * len(dataset) // config.batch_size + 1),desc="Training"):
    for _ in range(config.num_epochs):
        # Fresh per-worker model copies and optimizers at the start of each epoch.
        models = []
        criterion = nn.CrossEntropyLoss()
        for i in range(fed_num):
            models.append(model.to(device).copy().send(str(i)))
            #criterions.append(nn.CrossEntropyLoss())
        optimizers = []
        for i in range(fed_num):
            optimizers.append(
                torch.optim.Adam(models[i].parameters(),
                                 lr=config.learning_rate)
            )
        for i, (minibatch, target) in enumerate(dataloader):
            step += 1
            minibatch, target = minibatch.to(device), target.to(device)
            # The federated loader tags each batch with its owning worker.
            location = minibatch.location
            predicts = [0 for _ in range(fed_num)]
            losses = [0 for _ in range(fed_num)]
            for j in range(fed_num):
                # Only the worker that owns this minibatch trains on it.
                if VirtualWorker[j] != location:
                    continue
                else:
                    optimizers[j].zero_grad()
                    predicts[j] = models[j](minibatch)
                    losses[j] = criterion(predicts[j], target)
                    losses[j].backward()
                    optimizers[j].step()
                    # .get() pulls the loss tensor back from the remote worker.
                    losses[j] = losses[j].get().cpu().item()
            print(losses)
            # Untrained slots contribute 0, so the sum is this batch's loss.
            loss = np.sum(losses)
            loss_full.append(loss)
            if i % 10 == 0:
                writer.add_scalar('Train/Loss', loss, step)
            if i % config.num_batches_show_loss == 0:
                tqdm.write(
                    f"Time {time_since(start_time)}, batches {i}, current loss {loss:.4f}, average loss: {np.mean(loss_full):.4f}, latest average loss: {np.mean(loss_full[-256:]):.4f}"
                )
            if (i % config.num_batches_validate == 0) and (i!=0):
                # Federated averaging: move every worker model to the secure
                # worker, average parameters element-wise, reload into `model`.
                with torch.no_grad():
                    paraDict = model.state_dict()
                    #model_temp = [0 for _ in range(fed_num)]
                    parasDict = []
                    for k in range(fed_num):
                        #model_temp[k] = models[k].copy().send(secure_worker)
                        models[k].move(secure_worker)
                        parasDict.append(models[k].state_dict())
                    for name in paraDict:
                        paraDict[name] = parasDict[0][name].clone().get()
                        for index in range(1, fed_num):
                            paraDict[name] += parasDict[index][name].clone().get()
                        paraDict[name] /= fed_num
                    model.load_state_dict(paraDict)
                #model = model.to(device)
                # Redistribute fresh copies of the averaged model.
                models = []
                for index in range(fed_num):
                    models.append(model.to(device).copy().send(str(index)))
                model.eval()
                val_auc, val_mrr, val_ndcg5, val_ndcg10 = evaluate(
                    model, './data/val', 200000)
                model.train()
                writer.add_scalar('Validation/AUC', val_auc, step)
                writer.add_scalar('Validation/MRR', val_mrr, step)
                writer.add_scalar('Validation/nDCG@5', val_ndcg5, step)
                writer.add_scalar('Validation/nDCG@10', val_ndcg10, step)
                tqdm.write(
                    f"Time {time_since(start_time)}, batches {i}, validation AUC: {val_auc:.4f}, validation MRR: {val_mrr:.4f}, validation nDCG@5: {val_ndcg5:.4f}, validation nDCG@10: {val_ndcg10:.4f}, "
                )
                early_stop, get_better = early_stopping(-val_auc)
                if early_stop:
                    tqdm.write('Early stop.')
                    break
    # NOTE(review): the opening ''' below starts a commented-out tail whose
    # closing quotes lie beyond this excerpt -- confirm in the full file.
    '''
def train():
    """Two-phase training for a model with an auxiliary topic classifier.

    Phase 1 (optional, Config.num_batches_classification > 0): pretrain on
    the topic-classification loss only. Phase 2: train on click prediction
    for Config.num_batches batches, optionally adding the weighted topic
    loss when Config.joint_loss is set. Validates periodically and saves a
    checkpoint whenever validation AUC improves; early-stops on stagnation.
    """
    writer = SummaryWriter(
        log_dir=
        f"./runs/{model_name}/{datetime.datetime.now().replace(microsecond=0).isoformat()}-{Config.num_batches_classification:04}-{Config.joint_loss}"
    )
    # Pretrained word embeddings are optional.
    try:
        pretrained_word_embedding = torch.from_numpy(
            np.load('./data/train/pretrained_word_embedding.npy')).float()
    except FileNotFoundError:
        pretrained_word_embedding = None
    model = Model(Config, pretrained_word_embedding, writer).to(device)
    print(model)
    dataset = BaseDataset('data/train/behaviors_parsed.tsv',
                          'data/train/news_parsed.tsv',
                          Config.dataset_attributes)
    print(f"Load training dataset with size {len(dataset)}.")
    # Manual iterator so exhaustion can be caught and the loader restarted.
    dataloader = iter(
        DataLoader(dataset,
                   batch_size=Config.batch_size,
                   shuffle=True,
                   num_workers=Config.num_workers,
                   drop_last=True))
    checkpoint_dir = os.path.join('./checkpoint', model_name)
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
    if Config.num_batches_classification != 0:
        # ---- Phase 1: topic-classification pretraining only. ----
        step_classification = 0
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=Config.learning_rate)
        start_time = time.time()
        with tqdm(total=Config.num_batches_classification,
                  desc="Training (classification)") as pbar:
            for i in range(1, Config.num_batches_classification + 1):
                try:
                    minibatch = next(dataloader)
                except StopIteration:
                    # Dataset exhausted: rebuild the loader and keep going.
                    dataloader = iter(
                        DataLoader(dataset,
                                   batch_size=Config.batch_size,
                                   shuffle=True,
                                   num_workers=Config.num_workers,
                                   drop_last=True))
                    minibatch = next(dataloader)
                step_classification += 1
                # classification_only=True: only the topic head's loss is used.
                _, topic_classification_loss = model(
                    minibatch["candidate_news"],
                    minibatch["clicked_news"],
                    classification_only=True)
                writer.add_scalar('Train(classification)/Loss',
                                  topic_classification_loss.item(),
                                  step_classification)
                optimizer.zero_grad()
                topic_classification_loss.backward()
                optimizer.step()
                if i % Config.num_batches_show_loss == 0:
                    tqdm.write(
                        f"Time {time_since(start_time)}, batches {i}, current loss {topic_classification_loss.item():.4f}"
                    )
                pbar.update(1)
    # ---- Phase 2: click-prediction training. ----
    loss_full = []
    exhaustion_count = 0
    step = 0
    # A fresh optimizer is created for this phase (phase-1 Adam state is discarded).
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=Config.learning_rate)
    start_time = time.time()
    early_stopping = EarlyStopping()
    with tqdm(total=Config.num_batches, desc="Training") as pbar:
        for i in range(1, Config.num_batches + 1):
            try:
                minibatch = next(dataloader)
            except StopIteration:
                exhaustion_count += 1
                tqdm.write(
                    f"Training data exhausted for {exhaustion_count} times after {i} batches, reuse the dataset."
                )
                dataloader = iter(
                    DataLoader(dataset,
                               batch_size=Config.batch_size,
                               shuffle=True,
                               num_workers=Config.num_workers,
                               drop_last=True))
                minibatch = next(dataloader)
            step += 1
            y_pred, topic_classification_loss = model(
                minibatch["candidate_news"], minibatch["clicked_news"])
            # Mean NLL of candidate 0 in each sample; assumes index 0 is the
            # positive (clicked) candidate -- TODO confirm in dataset builder.
            loss = torch.stack([x[0] for x in -F.log_softmax(y_pred, dim=1)
                                ]).mean()
            if Config.joint_loss:
                # Joint objective: add the weighted topic-classification loss.
                writer.add_scalar('Train/BaseLoss', loss.item(), step)
                writer.add_scalar('Train/TopicClassificationLoss',
                                  topic_classification_loss.item(), step)
                writer.add_scalar(
                    'Train/TopicBaseRatio',
                    topic_classification_loss.item() / loss.item(), step)
                loss += Config.topic_classification_loss_weight * topic_classification_loss
            loss_full.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            writer.add_scalar('Train/Loss', loss.item(), step)
            if i % Config.num_batches_show_loss == 0:
                tqdm.write(
                    f"Time {time_since(start_time)}, batches {i}, current loss {loss.item():.4f}, average loss: {np.mean(loss_full):.4f}"
                )
            if i % Config.num_batches_validate == 0:
                val_auc, val_mrr, val_ndcg5, val_ndcg10 = evaluate(
                    model, './data/val')
                writer.add_scalar('Validation/AUC', val_auc, step)
                writer.add_scalar('Validation/MRR', val_mrr, step)
                writer.add_scalar('Validation/nDCG@5', val_ndcg5, step)
                writer.add_scalar('Validation/nDCG@10', val_ndcg10, step)
                tqdm.write(
                    f"Time {time_since(start_time)}, batches {i}, validation AUC: {val_auc:.4f}, validation MRR: {val_mrr:.4f}, validation nDCG@5: {val_ndcg5:.4f}, validation nDCG@10: {val_ndcg10:.4f}, "
                )
                # Early stopping tracks -AUC (lower is better for the stopper).
                early_stop, get_better = early_stopping(-val_auc)
                if early_stop:
                    tqdm.write('Early stop.')
                    break
                elif get_better:
                    # Persist the best model so far (model weights only).
                    torch.save({'model_state_dict': model.state_dict()},
                               f"./checkpoint/{model_name}/ckpt-{step}.pth")
            pbar.update(1)
def __init__(self, opt):
    """Split raw data into Train/Validate/Test (if not already split) and load it.

    Expects `opt.dir` to contain `data.txt` (one JSON record per line) and
    `label.txt`. If the split directories under `opt.data_dir` do not all
    exist, they are rebuilt: records carrying a "type" field keep their
    previous assignment; the rest are assigned randomly per `opt.ratio`.
    Then labels are loaded and, depending on `opt.mode`, the Train/Validate
    or Test BaseDataset instances are created.

    Raises AssertionError if data.txt or label.txt is missing.
    """
    self.opt = opt
    print(opt.dir + "/data.txt")
    assert os.path.exists(
        opt.dir + "/data.txt"), "No data.txt found in specified dir"
    assert os.path.exists(
        opt.dir + "/label.txt"), "No label.txt found in specified dir"

    # NOTE(review): split dirs come from opt.data_dir while the raw files
    # come from opt.dir -- confirm both options point where intended.
    train_dir = opt.data_dir + "/TrainSet/"
    val_dir = opt.data_dir + "/ValidateSet/"
    test_dir = opt.data_dir + "/TestSet/"

    # Rebuild the split only if any of the three directories is missing.
    if not all([
            os.path.exists(train_dir),
            os.path.exists(val_dir),
            os.path.exists(test_dir)
    ]):
        # rm existing directories so a partial split never survives
        rmdir(train_dir)
        rmdir(val_dir)
        rmdir(test_dir)

        # split data to Train, Val, Test
        logging.info("Split raw data to Train, Val and Test")
        ratios = opt.ratio
        dataset = collections.defaultdict(list)
        with open(opt.dir + '/data.txt') as d:
            for line in d.readlines():
                line = json.loads(line)
                # If the record already carries a split assignment, honor it.
                # BUGFIX: dict.has_key() was removed in Python 3; the
                # membership test is the correct replacement.
                if "type" in line:
                    dataset[line["type"]].append(line)
                    continue
                # Otherwise assign a split at random per the configured ratios.
                rand = random.random()
                if rand < ratios[0]:
                    data_type = "Train"
                elif rand < ratios[0] + ratios[1]:
                    data_type = "Validate"
                else:
                    data_type = "Test"
                dataset[data_type].append(line)

        # write to file
        self._WriteDataToFile(dataset["Train"], train_dir)
        self._WriteDataToFile(dataset["Validate"], val_dir)
        self._WriteDataToFile(dataset["Test"], test_dir)

    self.rid2name, self.id2rid, self.rid2id = load_label(opt.dir + '/label.txt')
    # Each label table reserves 2 bookkeeping entries, hence the -2.
    self.num_classes = [len(item) - 2 for item in self.rid2name]

    # load dataset
    if opt.mode == "Train":
        logging.info("Load Train Dataset...")
        self.train_set = BaseDataset(self.opt, "TrainSet", self.rid2id)
        logging.info("Load Validate Dataset...")
        self.val_set = BaseDataset(self.opt, "ValidateSet", self.rid2id)
    else:
        # force batch_size for test to 1
        self.opt.batch_size = 1
        self.opt.load_thread = 1
        logging.info("Load Test Dataset...")
        self.test_set = BaseDataset(self.opt, "TestSet", self.rid2id)
# --- Single-competition aggregators (one league file each) -----------------
BR_SERIE_B = DatasetAggregator(ObservationDataset('leagues/br_serie_b.json'))
BR_SERIE_A = DatasetAggregator(ObservationDataset('leagues/br_serie_a.json'))
BAHRAIN_PL = DatasetAggregator(ObservationDataset('leagues/bahrain_pl.json'))
BEL_PL = DatasetAggregator(ObservationDataset('leagues/bel_pl.json'))
K_LEAGUE = DatasetAggregator(ObservationDataset('leagues/k_league.json'))
K_LEAGUE2 = DatasetAggregator(ObservationDataset('leagues/k_league2.json'))
COSTA_RICA = DatasetAggregator(ObservationDataset('leagues/costa_rica_primera.json'))
NB_1_LIGA = DatasetAggregator(ObservationDataset('leagues/nb_1_liga.json'))
Eliteserien = DatasetAggregator(ObservationDataset('leagues/eliteserien.json'))
Allsvenskan = DatasetAggregator(ObservationDataset('leagues/allsvenskan.json'))
CHINA_SUPER_LEAGUE = DatasetAggregator(ObservationDataset('leagues/china_super_league.json'))

# --- English cup aggregators ------------------------------------------------
# Cup observations are paired with feature datasets built from the four
# English league files, each tagged with a 'strength' value (0 = top tier).
FA_CUP = DatasetAggregator(
    ObservationDataset('cups/fa_cup.json'),
    FeatureDataset([
        BaseDataset.from_file(league_path, {'strength': tier})
        for league_path, tier in (
            ('leagues/epl.json', 0),
            ('leagues/efl_championship.json', 1),
            ('leagues/efl_league1.json', 2),
            ('leagues/efl_league2.json', 3),
        )
    ]),
)

LEAGUE_CUP = DatasetAggregator(
    ObservationDataset('cups/league_cup.json'),
    FeatureDataset([
        BaseDataset.from_file(league_path, {'strength': tier})
        for league_path, tier in (
            ('leagues/epl.json', 0),
            ('leagues/efl_championship.json', 1),
            ('leagues/efl_league1.json', 2),
            ('leagues/efl_league2.json', 3),
        )
    ]),
)