Code Example #1
File: evaluation.py Project: matwoess/sed_framework
    def evaluate(self) -> None:
        # create datasets and loaders
        dev_dataset = BaseDataset(self.feature_type, self.scene,
                                  self.hyper_params, self.fft_params)
        eval_dataset = BaseDataset(self.feature_type,
                                   self.scene,
                                   self.hyper_params,
                                   self.fft_params,
                                   data_path=os.path.join('data', 'eval'))
        dev_set = ExcerptDataset(dev_dataset,
                                 self.feature_type,
                                 self.classes,
                                 self.hyper_params['excerpt_size'],
                                 self.fft_params,
                                 overlap_factor=1,
                                 rnd_augment=False)
        eval_set = ExcerptDataset(eval_dataset,
                                  self.feature_type,
                                  self.classes,
                                  self.hyper_params['excerpt_size'],
                                  self.fft_params,
                                  overlap_factor=1,
                                  rnd_augment=False)
        dev_loader = DataLoader(dev_set,
                                batch_size=1,
                                shuffle=False,
                                num_workers=0)
        eval_loader = DataLoader(eval_set,
                                 batch_size=1,
                                 shuffle=False,
                                 num_workers=0)

        # evaluate on all individual files for both dev and eval set
        eval_loss, metrics_eval, metrics_pp_eval = self.evaluate_model_on_files(
            eval_loader)
        dev_loss, metrics_dev, metrics_pp_dev = self.evaluate_model_on_files(
            dev_loader)

        # write results to files and log parameters
        self.write_losses(dev_loss, eval_loss)
        self.write_metrics(metrics_dev, metrics_eval, metrics_pp_dev,
                           metrics_pp_eval)
        filtered_results = self.log_params(metrics_eval, metrics_pp_eval)
        print(
            f'final eval ER: {filtered_results["final_metric/segment_based/overall/ER"]}'
        )
        print(
            f'final eval F: {filtered_results["final_metric/segment_based/overall/F"]}'
        )
        print(
            f'final eval ER (post-processed): {filtered_results["final_metric_pp/segment_based/overall/ER"]}'
        )
        print(
            f'final eval F (post-processed): {filtered_results["final_metric_pp/segment_based/overall/F"]}'
        )
Code Example #2
def get_train_dataset(config):
    dataset = BaseDataset(config.train_dir + '/behaviors_parsed.tsv',
                          config.train_dir + '/news_parsed.tsv',
                          config.dataset_attributes)

    print(f"Load training dataset with size {len(dataset)}.")
    return dataset
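
For reference, a minimal usage sketch of how a helper like this is typically paired with a DataLoader; the `config.batch_size` and `config.num_workers` fields are assumed here (they mirror the later training examples) rather than taken from this snippet:

from torch.utils.data import DataLoader

# Minimal sketch: wrap the returned dataset for mini-batch training.
dataset = get_train_dataset(config)
dataloader = DataLoader(dataset,
                        batch_size=config.batch_size,
                        shuffle=True,
                        num_workers=config.num_workers,
                        drop_last=True)
for minibatch in dataloader:
    pass  # forward/backward pass would go here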
Code Example #3
    def __init__(self, cfg):
        self.cfg = cfg
        self.Image_generator = U_Net(in_ch=3, out_ch=cfg.DATASET.N_CLASS,
                                     norm=torch.nn.BatchNorm2d, side='no')

        train_dataset = BaseDataset(cfg, split='train')
        valid_dataset = BaseDataset(cfg, split='val')
        self.train_dataloader = data.DataLoader(train_dataset, batch_size=cfg.DATASET.BATCHSIZE,
                                                num_workers=8, shuffle=True, drop_last=True)
        self.valid_dataloader = data.DataLoader(valid_dataset, batch_size=cfg.DATASET.BATCHSIZE,
                                                num_workers=8, shuffle=True, drop_last=True)

        self.criterion = torch.nn.CrossEntropyLoss(
            ignore_index=self.cfg.LOSS.IGNORE_INDEX,
            weight=torch.tensor([1, 0.5, 0.5, 1, 3, 1, 1, 1, 1]).cuda())

        self.ckpt_outdir = os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints')
        if not os.path.isdir(self.ckpt_outdir):
            os.mkdir(self.ckpt_outdir)
        self.val_outdir = os.path.join(cfg.TRAIN.OUTDIR, 'val')
        if not os.path.isdir(self.val_outdir):
            os.mkdir(self.val_outdir)
        self.start_epoch = cfg.TRAIN.RESUME
        self.n_epoch = cfg.TRAIN.N_EPOCH

        self.optimizer = torch.optim.Adam([{'params':self.Image_generator.parameters()}],
                                           lr=cfg.OPTIMIZER.G_LR,
                                           betas=(cfg.OPTIMIZER.BETA1, cfg.OPTIMIZER.BETA2),
                                           weight_decay=cfg.OPTIMIZER.WEIGHT_DECAY)

        iter_per_epoch = len(train_dataset) // cfg.DATASET.BATCHSIZE
        lambda_poly = lambda iters: pow((1.0 - iters / (cfg.TRAIN.N_EPOCH * iter_per_epoch)), 0.9)
        self.scheduler = torch.optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda=lambda_poly)

        self.logger = logger(cfg.TRAIN.OUTDIR, name='train')
        self.running_metrics = runningScore(n_classes=cfg.DATASET.N_CLASS)

        if self.start_epoch >= 0:
            self.Image_generator.load_state_dict(
                torch.load(os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints', '{}epoch.pth'.format(self.start_epoch)))['model'])
            self.optimizer.load_state_dict(
                torch.load(os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints', '{}epoch.pth'.format(self.start_epoch)))['optimizer'])

            log = "Using the {}th checkpoint".format(self.start_epoch)
            self.logger.info(log)
        self.Image_generator = self.Image_generator.cuda()
        self.criterion = self.criterion.cuda()
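
The lambda_poly schedule above multiplies the base learning rate by (1.0 - iters / total_iters) ** 0.9, decaying it from the full value down to zero over training. A tiny standalone sketch of the same curve, with made-up step counts standing in for N_EPOCH * iter_per_epoch:

# Illustrative only: total_iters is a placeholder, not a value from the example above.
total_iters = 100
poly = lambda iters: (1.0 - iters / total_iters) ** 0.9
print(poly(0), round(poly(50), 3), poly(100))  # 1.0 0.536 0.0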
Code Example #4
File: train.py Project: baeyuna97/SNU_DLab
def train():
    writer = SummaryWriter(
        log_dir=
        f"../runs/{model_name}/{datetime.datetime.now().replace(microsecond=0).isoformat()}{'-' + os.environ['REMARK'] if 'REMARK' in os.environ else ''}"
    )

    if not os.path.exists('checkpoint'):
        os.makedirs('checkpoint')

    try:
        pretrained_word_embedding = torch.from_numpy(
            np.load('../data/train/pretrained_word_embedding.npy')).float()
    except FileNotFoundError:
        pretrained_word_embedding = None

    if model_name == 'DKN':
        try:
            pretrained_entity_embedding = torch.from_numpy(
                np.load(
                    '../data/train/pretrained_entity_embedding.npy')).float()
        except FileNotFoundError:
            pretrained_entity_embedding = None

        try:
            pretrained_context_embedding = torch.from_numpy(
                np.load(
                    '../data/train/pretrained_context_embedding.npy')).float()
        except FileNotFoundError:
            pretrained_context_embedding = None

        model = Model(config, pretrained_word_embedding,
                      pretrained_entity_embedding,
                      pretrained_context_embedding, writer).to(device)
    else:
        model = Model(config, pretrained_word_embedding, writer).to(device)

    print(model)

    dataset = BaseDataset('../data/train/behaviors_parsed.tsv',
                          '../data/train/news_parsed.tsv',
                          config.dataset_attributes)

    print(f"Load training dataset with size {len(dataset)}.")

    dataloader = iter(
        DataLoader(dataset,
                   batch_size=config.batch_size,
                   shuffle=True,
                   num_workers=config.num_workers,
                   drop_last=True))

    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    start_time = time.time()
    loss_full = []
    exhaustion_count = 0
    step = 0
    early_stopping = EarlyStopping()

    checkpoint_dir = os.path.join('../checkpoint', model_name)
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)

    checkpoint_path = latest_checkpoint(checkpoint_dir)
    if checkpoint_path is not None:
        print(f"Load saved parameters in {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        step = checkpoint['step']
        early_stopping(checkpoint['early_stop_value'])
        model.train()

    with tqdm(total=config.num_batches, desc="Training") as pbar:
        for i in range(1, config.num_batches + 1):
            try:
                minibatch = next(dataloader)
            except StopIteration:
                exhaustion_count += 1
                tqdm.write(
                    f"Training data exhausted for {exhaustion_count} times after {i} batches, reuse the dataset."
                )
                dataloader = iter(
                    DataLoader(dataset,
                               batch_size=config.batch_size,
                               shuffle=True,
                               num_workers=config.num_workers,
                               drop_last=True))
                minibatch = next(dataloader)

            step += 1
            if model_name == 'LSTUR':
                y_pred = model(minibatch["user"],
                               minibatch["clicked_news_length"],
                               minibatch["candidate_news"],
                               minibatch["clicked_news"])
            elif model_name == 'HiFiArk':
                y_pred, regularizer_loss = model(minibatch["candidate_news"],
                                                 minibatch["clicked_news"])
            elif model_name == 'TANR':
                y_pred, topic_classification_loss = model(
                    minibatch["candidate_news"], minibatch["clicked_news"])
            else:
                y_pred = model(minibatch["candidate_news"],
                               minibatch["clicked_news"])

            loss = torch.stack([x[0] for x in -F.log_softmax(y_pred, dim=1)
                                ]).mean()
            if model_name == 'HiFiArk':
                if i % 10 == 0:
                    writer.add_scalar('Train/BaseLoss', loss.item(), step)
                    writer.add_scalar('Train/RegularizerLoss',
                                      regularizer_loss.item(), step)
                    writer.add_scalar('Train/RegularizerBaseRatio',
                                      regularizer_loss.item() / loss.item(),
                                      step)
                loss += config.regularizer_loss_weight * regularizer_loss
            elif model_name == 'TANR':
                if i % 10 == 0:
                    writer.add_scalar('Train/BaseLoss', loss.item(), step)
                    writer.add_scalar('Train/TopicClassificationLoss',
                                      topic_classification_loss.item(), step)
                    writer.add_scalar(
                        'Train/TopicBaseRatio',
                        topic_classification_loss.item() / loss.item(), step)
                loss += config.topic_classification_loss_weight * topic_classification_loss
            loss_full.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % 10 == 0:
                writer.add_scalar('Train/Loss', loss.item(), step)

            if i % config.num_batches_show_loss == 0:
                tqdm.write(
                    f"Time {time_since(start_time)}, batches {i}, current loss {loss.item():.4f}, average loss: {np.mean(loss_full):.4f}"
                )

            if i % config.num_batches_validate == 0:
                val_auc, val_mrr, val_ndcg5, val_ndcg10 = evaluate(
                    model, '../data/val')
                writer.add_scalar('Validation/AUC', val_auc, step)
                writer.add_scalar('Validation/MRR', val_mrr, step)
                writer.add_scalar('Validation/nDCG@5', val_ndcg5, step)
                writer.add_scalar('Validation/nDCG@10', val_ndcg10, step)
                tqdm.write(
                    f"Time {time_since(start_time)}, batches {i}, validation AUC: {val_auc:.4f}, validation MRR: {val_mrr:.4f}, validation nDCG@5: {val_ndcg5:.4f}, validation nDCG@10: {val_ndcg10:.4f}, "
                )

                early_stop, get_better = early_stopping(-val_auc)
                if early_stop:
                    tqdm.write('Early stop.')
                    break
                elif get_better:
                    torch.save(
                        {
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'step': step,
                            'early_stop_value': -val_auc
                        }, f"../checkpoint/{model_name}/ckpt-{step}.pth")

            pbar.update(1)
Code Example #5
def extraction(cfg):
    # cpu or gpu?
    if torch.cuda.is_available() and cfg.device is not None:
        device = torch.device(cfg.device)
    else:
        if not torch.cuda.is_available():
            print("hey man, buy a GPU!")
        device = torch.device("cpu")

    dataset = BaseDataset(path=cfg.dataset_path,
                          dataset=cfg.dataset,
                          mode=cfg.mode)
    data_loader = DataLoader(dataset,
                             cfg.batch_size,
                             shuffle=False,
                             num_workers=cfg.num_workers)

    if cfg.model_type == 'UniNet':
        featnet = FeatNet()
        featnet.load_state_dict(
            torch.load(cfg.featnet_path, map_location=device))
        featnet.to(device)
        masknet = MaskNet()
        masknet.load_state_dict(
            torch.load(cfg.masknet_path, map_location=device))
        masknet.to(device)

        with torch.no_grad():
            featnet.eval()
            masknet.eval()
            labels = []
            img_names = []
            labels_vec = np.zeros((len(dataset), dataset.class_num))
            features = np.zeros((len(dataset), 64, 512))
            masks = np.zeros((len(dataset), 3, 64, 512))
            didx = -1
            for img_batch, label_batch, label_vec_batch, img_name_batch in tqdm(
                    data_loader, ncols=80, ascii=True):
                didx += 1
                img_batch = img_batch.to(device)
                feature_batch = featnet(img_batch)
                mask_batch = masknet(img_batch)
                for idx in range(feature_batch.shape[0]):
                    labels_vec[idx + didx, :] = label_vec_batch[idx, :].numpy()
                    labels.append(label_batch[idx])
                    img_names.append(img_name_batch[idx])
                    features[idx +
                             didx, :, :] = feature_batch[idx].cpu().numpy()
                    masks[idx + didx, :2, :, :] = mask_batch[idx].cpu().numpy()
                    mask = F.softmax(mask_batch[idx], dim=0).cpu().numpy()
                    masks[idx + didx, 2, :, :] = mask[0] < mask[1]

    else:
        if cfg.model_type == 'maxout-feature':
            model = Maxout_feature()
        elif cfg.model_type == 'facenet':
            model = FaceModel(256)
        model.load_state_dict(torch.load(cfg.model_path, map_location=device))
        model.to(device)

        with torch.no_grad():
            model.eval()
            labels = []
            img_names = []
            labels_vec = np.zeros((len(dataset), dataset.class_num))
            features = np.zeros((len(dataset), 64, 512))
            masks = np.ones((len(dataset), 3, 64, 512))
            didx = -1
            for img_batch, label_batch, label_vec_batch, img_name_batch in tqdm(
                    data_loader, ncols=80, ascii=True):
                didx += 1
                img_batch = img_batch.to(device)
                feature_batch = model(img_batch)
                for idx in range(feature_batch.shape[0]):
                    labels_vec[idx + didx, :] = label_vec_batch[idx, :].numpy()
                    labels.append(label_batch[idx])
                    img_names.append(img_name_batch[idx])
                    features[idx +
                             didx, :, :] = feature_batch[idx].cpu().numpy()

    if cfg.save == 'mat':
        ft_path = 'feature/{}__{}.mat'.format(cfg.model, cfg.dataset)
        ft_load = {
            'features': features,
            'masks': masks,
            'labels_vec': labels_vec,
            'labels': labels
        }
        savemat(ft_path, ft_load)
    elif cfg.save == 'pic':
        if not os.path.exists('feature/{}__{}'.format(cfg.model, cfg.dataset)):
            os.makedirs('feature/{}__{}'.format(cfg.model, cfg.dataset))
        for idx in range(len(dataset)):
            feature_img = features[idx, :, :]
            feature_img = (feature_img - feature_img.min()) / (
                feature_img.max() - feature_img.min())
            Image.fromarray(feature_img * 255).convert('L').save(
                'feature/{}__{}/{}_feature.png'.format(cfg.model, cfg.dataset,
                                                       img_names[idx]))
            Image.fromarray(masks[idx, 2, :, :] * 255).convert('L').save(
                'feature/{}__{}/{}_mask.png'.format(cfg.model, cfg.dataset,
                                                    img_names[idx]))

    return features, masks, labels, labels_vec
Code Example #6
File: main.py Project: matwoess/sed_framework
def main(eval_mode: bool, feature_type: str, scene: str, hyper_params: dict,
         network_config: dict, eval_settings: dict, fft_params: dict) -> None:
    """
    Main function that takes hyper-parameters, creates the architecture, trains the model and evaluates it
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    os.makedirs('results', exist_ok=True)
    experiment_id = datetime.now().strftime(
        "%Y%m%d-%H%M%S") + f' - {feature_type} - {scene}'
    writer = SummaryWriter(log_dir=os.path.join('tensorboard', experiment_id))
    shutil.copyfile('config.json', os.path.join(
        'results', 'config.json'))  # save current config file to results
    training_dataset = BaseDataset(feature_type, scene, hyper_params,
                                   fft_params)
    # create network
    classes = util.get_scene_classes(scene)
    plotter = Plotter(classes,
                      hop_size=fft_params['hop_size'],
                      sampling_rate=22050)
    # finalize network config parameters
    network_config['out_features'] = len(classes)
    if feature_type == 'spec':
        network_config['n_features'] = fft_params['n_fft'] // 2 + 1
    elif feature_type == 'mfcc':
        network_config['n_features'] = fft_params['n_mfcc']
    elif feature_type == 'mels':
        network_config['n_features'] = fft_params['n_mels']
    # create network
    net = SimpleCNN(**network_config)
    # Save initial model as "best" model (will be overwritten later)
    model_path = os.path.join('results',
                              f'best_{feature_type}_{scene}_model.pt')
    if not os.path.exists(model_path):
        torch.save(net, model_path)
    else:  # if there already exists a model, just load parameters
        print(f'reusing pre-trained model: "{model_path}"')
        net = torch.load(model_path, map_location=torch.device('cpu'))
    net.to(device)
    # get loss function
    loss_fn = torch.nn.BCELoss()
    # create adam optimizer
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=hyper_params['learning_rate'],
                                 weight_decay=hyper_params['weight_decay'])

    train_stats_at = eval_settings['train_stats_at']
    validate_at = eval_settings['validate_at']
    best_loss = np.inf  # best validation loss so far
    progress_bar = tqdm.tqdm(total=hyper_params['n_updates'],
                             desc=f"loss: {np.nan:7.5f}",
                             position=0)
    update = 0  # current update counter

    fold_idx = 1  # one random fold (defines split into training and validation set)
    rnd_augment = hyper_params['rnd_augment']
    # create subsets and data loaders
    if eval_mode:
        train_subset = training_dataset
        val_loader = None
    else:
        train_subset = Subset(training_dataset,
                              training_dataset.get_fold_indices(fold_idx)[0])
        val_subset = Subset(training_dataset,
                            training_dataset.get_fold_indices(fold_idx)[1])
        val_set = ExcerptDataset(val_subset,
                                 feature_type,
                                 classes,
                                 hyper_params['excerpt_size'],
                                 fft_params,
                                 overlap_factor=1,
                                 rnd_augment=False)
        val_loader = DataLoader(val_set,
                                batch_size=hyper_params['batch_size'],
                                shuffle=False,
                                num_workers=0)

    train_set = ExcerptDataset(
        train_subset,
        feature_type,
        classes,
        hyper_params['excerpt_size'],
        fft_params,
        overlap_factor=hyper_params['train_overlap_factor'],
        rnd_augment=rnd_augment)
    train_loader = DataLoader(train_set,
                              batch_size=hyper_params['batch_size'],
                              shuffle=True,
                              num_workers=0)

    n_updates = hyper_params['n_updates']
    # main training loop
    while update <= n_updates:
        if rnd_augment and update > 0:
            # regenerate new excerpts (in background) but use current ones for training
            train_set.generate_excerpts()
        for data in train_loader:
            inputs, targets, audio_file, idx = data
            inputs = inputs.to(device, dtype=torch.float32)
            targets = targets.to(device, dtype=torch.float32)
            optimizer.zero_grad()
            predictions = net(inputs)
            loss = loss_fn(predictions, targets)
            loss.backward()
            optimizer.step()

            if update % train_stats_at == 0 and update > 0:
                # log training loss
                writer.add_scalar(tag="training/loss",
                                  scalar_value=loss.cpu(),
                                  global_step=update)

            if not eval_mode and update % validate_at == 0 and update > 0:
                # evaluate model on validation set, log parameters and metrics
                val_loss, metrics, metrics_pp = validate_model(
                    net, val_loader, classes, update, device, plotter)
                print(f'val_loss: {val_loss}')
                f_score = metrics['segment_based']['overall']['F']
                err_rate = metrics['segment_based']['overall']['ER']
                f_score_pp = metrics_pp['segment_based']['overall']['F']
                err_rate_pp = metrics_pp['segment_based']['overall']['ER']
                print(f'f_score: {f_score}')
                print(f'err_rate: {err_rate}')
                print(f'f_score_pp: {f_score_pp}')
                print(f'err_rate_pp: {err_rate_pp}')
                params = net.parameters()
                log_validation_params(writer, val_loss, params, metrics,
                                      metrics_pp, update)
                # Save best model for early stopping
                if val_loss < best_loss:
                    print(
                        f'{val_loss} < {best_loss}... saving as new {os.path.split(model_path)[-1]}'
                    )
                    best_loss = val_loss
                    torch.save(net, model_path)

            if eval_mode:
                # in eval mode, just compare train_loss
                train_loss = loss.cpu()
                if train_loss < best_loss:
                    print(
                        f'{train_loss} < {best_loss}... saving as new {os.path.split(model_path)[-1]}'
                    )
                    best_loss = train_loss
                    torch.save(net, model_path)

            # update progress and update-counter
            progress_bar.set_description(f"loss: {loss:7.5f}", refresh=True)
            progress_bar.update()
            update += 1
            if update >= n_updates:
                break

    progress_bar.close()
    print('finished training.')

    print('starting evaluation...')
    evaluator = evaluation.Evaluator(feature_type, scene, hyper_params,
                                     network_config, fft_params, model_path,
                                     device, writer, plotter)
    evaluator.evaluate()
    print('zipping "results" folder...')
    util.zip_folder('results', f'results_{feature_type}_{scene}')
Code Example #7
    def __init__(self, cfg):
        self.cfg = cfg
        self.OldLabel_generator = U_Net(in_ch=cfg.DATASET.N_CLASS,
                                        out_ch=cfg.DATASET.N_CLASS,
                                        side='out')
        self.Image_generator = U_Net(in_ch=3,
                                     out_ch=cfg.DATASET.N_CLASS,
                                     side='in')
        self.discriminator = Discriminator(cfg.DATASET.N_CLASS + 3,
                                           cfg.DATASET.IMGSIZE,
                                           patch=True)

        self.criterion_G = GeneratorLoss(cfg.LOSS.LOSS_WEIGHT[0],
                                         cfg.LOSS.LOSS_WEIGHT[1],
                                         cfg.LOSS.LOSS_WEIGHT[2],
                                         ignore_index=cfg.LOSS.IGNORE_INDEX)
        self.criterion_D = DiscriminatorLoss()

        train_dataset = BaseDataset(cfg, split='train')
        valid_dataset = BaseDataset(cfg, split='val')
        self.train_dataloader = data.DataLoader(
            train_dataset,
            batch_size=cfg.DATASET.BATCHSIZE,
            num_workers=8,
            shuffle=True,
            drop_last=True)
        self.valid_dataloader = data.DataLoader(
            valid_dataset,
            batch_size=cfg.DATASET.BATCHSIZE,
            num_workers=8,
            shuffle=True,
            drop_last=True)

        self.ckpt_outdir = os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints')
        if not os.path.isdir(self.ckpt_outdir):
            os.mkdir(self.ckpt_outdir)
        self.val_outdir = os.path.join(cfg.TRAIN.OUTDIR, 'val')
        if not os.path.isdir(self.val_outdir):
            os.mkdir(self.val_outdir)
        self.start_epoch = cfg.TRAIN.RESUME
        self.n_epoch = cfg.TRAIN.N_EPOCH

        self.optimizer_G = torch.optim.Adam(
            [{
                'params': self.OldLabel_generator.parameters()
            }, {
                'params': self.Image_generator.parameters()
            }],
            lr=cfg.OPTIMIZER.G_LR,
            betas=(cfg.OPTIMIZER.BETA1, cfg.OPTIMIZER.BETA2),
            # betas=(cfg.OPTIMIZER.BETA1, cfg.OPTIMIZER.BETA2),
            weight_decay=cfg.OPTIMIZER.WEIGHT_DECAY)

        self.optimizer_D = torch.optim.Adam(
            [{
                'params': self.discriminator.parameters(),
                'initial_lr': cfg.OPTIMIZER.D_LR
            }],
            lr=cfg.OPTIMIZER.D_LR,
            betas=(cfg.OPTIMIZER.BETA1, cfg.OPTIMIZER.BETA2),
            # betas=(cfg.OPTIMIZER.BETA1, cfg.OPTIMIZER.BETA2),
            weight_decay=cfg.OPTIMIZER.WEIGHT_DECAY)

        iter_per_epoch = len(train_dataset) // cfg.DATASET.BATCHSIZE
        lambda_poly = lambda iters: pow(
            (1.0 - iters / (cfg.TRAIN.N_EPOCH * iter_per_epoch)), 0.9)
        self.scheduler_G = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer_G,
            lr_lambda=lambda_poly,
        )
        # last_epoch=(self.start_epoch+1)*iter_per_epoch)
        self.scheduler_D = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer_D,
            lr_lambda=lambda_poly,
        )
        # last_epoch=(self.start_epoch+1)*iter_per_epoch)

        self.logger = logger(cfg.TRAIN.OUTDIR, name='train')
        self.running_metrics = runningScore(n_classes=cfg.DATASET.N_CLASS)

        if self.start_epoch >= 0:
            self.OldLabel_generator.load_state_dict(
                torch.load(
                    os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints',
                                 '{}epoch.pth'.format(
                                     self.start_epoch)))['model_G_N'])
            self.Image_generator.load_state_dict(
                torch.load(
                    os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints',
                                 '{}epoch.pth'.format(
                                     self.start_epoch)))['model_G_I'])
            self.discriminator.load_state_dict(
                torch.load(
                    os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints',
                                 '{}epoch.pth'.format(
                                     self.start_epoch)))['model_D'])
            self.optimizer_G.load_state_dict(
                torch.load(
                    os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints',
                                 '{}epoch.pth'.format(
                                     self.start_epoch)))['optimizer_G'])
            self.optimizer_D.load_state_dict(
                torch.load(
                    os.path.join(cfg.TRAIN.OUTDIR, 'checkpoints',
                                 '{}epoch.pth'.format(
                                     self.start_epoch)))['optimizer_D'])

            log = "Using the {}th checkpoint".format(self.start_epoch)
            self.logger.info(log)
        self.Image_generator = self.Image_generator.cuda()
        self.OldLabel_generator = self.OldLabel_generator.cuda()
        self.discriminator = self.discriminator.cuda()
        self.criterion_G = self.criterion_G.cuda()
        self.criterion_D = self.criterion_D.cuda()
Code Example #8
def train():
    writer = SummaryWriter(
        log_dir=
        f"./runs/{model_name}/{datetime.datetime.now().replace(microsecond=0).isoformat()}{'-' + os.environ['REMARK'] if 'REMARK' in os.environ else ''}"
    )

    if not os.path.exists('checkpoint'):
        os.makedirs('checkpoint')

    try:
        pretrained_word_embedding = torch.from_numpy(
            np.load('./data/train/pretrained_word_embedding.npy')).float()
    except FileNotFoundError:
        pretrained_word_embedding = None

    if model_name == 'DKN':
        try:
            pretrained_entity_embedding = torch.from_numpy(
                np.load(
                    './data/train/pretrained_entity_embedding.npy')).float()
        except FileNotFoundError:
            pretrained_entity_embedding = None

        try:
            pretrained_context_embedding = torch.from_numpy(
                np.load(
                    './data/train/pretrained_context_embedding.npy')).float()
        except FileNotFoundError:
            pretrained_context_embedding = None

        model = Model(config, pretrained_word_embedding,
                      pretrained_entity_embedding,
                      pretrained_context_embedding).to(device)
    else:
        model = Model(config, pretrained_word_embedding).to(device)

    print(model)

    dataset = BaseDataset('./data/train/behaviors_parsed.tsv',
                          './data/train/news_parsed.tsv',
                          './data/train/roberta')

    print(f"Load training dataset with size {len(dataset)}.")
    ###############################################
    '''
    dataloader = DataLoader(dataset,
                   batch_size=config.batch_size,
                   shuffle=True,
                   num_workers=config.num_workers,
                   drop_last=True,
                   pin_memory=True)'''
    ###############################################
    # In the step we need to tranform the dataset in federated manner
    '''
    federated_train_loader = sy.FederatedDataLoader(datasets.MNIST(
                                                            '../data', 
                                                            train=True, 
                                                            download=True,
                                                            transform=transforms.Compose(
                                                                            [transforms.ToTensor(),
                                                                             transforms.Normalize((0.1307,), (0.3081,))]
                                                                            )
                                                                    )
    federated_train_loader = sy.FederatedDataLoader( # <-- this is now a FederatedDataLoader 
                                        dataset.federate((bob, alice)), # <-- NEW: we distribute the dataset across all the workers, it's now a FederatedDataset
                                        batch_size=args.batch_size, 
                                        shuffle=True, **kwargs)
    dataloader = iter(sy.FederatedDataLoader(dataset.federate((bob, alice)),
                                            batch_size=config.batch_size,
                                            shuffle=True,
                                            #num_workers=config.num_workers,
                                            drop_last=True,
                                            #pin_memory=True
                                           ))
                                        '''
    #print(dataset)
    dataloader = sy.FederatedDataLoader(dataset.federate((bob, alice)),
                                        batch_size=config.batch_size,
                                        shuffle=True,
                                        num_workers=config.num_workers,
                                        drop_last=True,
                                        pin_memory=True)
    ###############################################
    print(f"The training dataset has been loaded!")
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=config.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=3,
                                                gamma=0.95,
                                                last_epoch=-1)
    start_time = time.time()
    loss_full = []
    exhaustion_count = 0
    step = 0
    early_stopping = EarlyStopping()

    checkpoint_dir = os.path.join('./checkpoint', model_name)
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)

    checkpoint_path = latest_checkpoint(checkpoint_dir)
    '''
    if checkpoint_path is not None:
        print(f"Load saved parameters in {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path)
        early_stopping(checkpoint['early_stop_value'])
        step = checkpoint['step']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        model.train()
    '''

    #for i in tqdm(range(1,config.num_epochs * len(dataset) // config.batch_size + 1),desc="Training"):
    for i, (minibatch, target) in enumerate(dataloader):
        ##### Get a mini batch of data from federated dataset
        #minibatch ,_ = next(dataloader)
        #print(minibatch)
        #print(minibatch.size())
        #exit()
        #minibatch = next(dataloader)
        step += 1
        if model_name == 'LSTUR':
            y_pred = model(minibatch["user"], minibatch["clicked_news_length"],
                           minibatch["candidate_news"],
                           minibatch["clicked_news"])
        elif model_name == 'HiFiArk':
            y_pred, regularizer_loss = model(minibatch["candidate_news"],
                                             minibatch["clicked_news"])
        elif model_name == 'TANR':
            y_pred, topic_classification_loss = model(
                minibatch["candidate_news"], minibatch["clicked_news"])
        else:
            #################################################
            # Send the model
            model.send(minibatch.location)
            minibatch, target = minibatch.to(device), target.to(device)
            #minibatch = minibatch.to(device)
            #################################################

            y_pred = model(minibatch)

        #y = torch.zeros(config.batch_size).long().to(device)
        #print(y_pred.get().size())
        #print(y.size())
        loss = criterion(y_pred, target)

        if model_name == 'HiFiArk':
            if i % 10 == 0:
                writer.add_scalar('Train/BaseLoss', loss.get(), step)
                writer.add_scalar('Train/RegularizerLoss',
                                  regularizer_loss.get(), step)
                writer.add_scalar('Train/RegularizerBaseRatio',
                                  regularizer_loss.get() / loss.get(), step)
            loss += config.regularizer_loss_weight * regularizer_loss
        elif model_name == 'TANR':
            if i % 10 == 0:
                writer.add_scalar('Train/BaseLoss', loss.item(), step)
                writer.add_scalar('Train/TopicClassificationLoss',
                                  topic_classification_loss.item(), step)
                writer.add_scalar(
                    'Train/TopicBaseRatio',
                    topic_classification_loss.item() / loss.item(), step)
            loss += config.topic_classification_loss_weight * topic_classification_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        model.get()

        loss = loss.get().detach().cpu().item()
        loss_full.append(loss)

        if i % 10 == 0:
            writer.add_scalar('Train/Loss', loss, step)

        if i % config.num_batches_show_loss == 0:
            #print(loss_full)
            #print(type(loss_full))
            tqdm.write(
                f"Time {time_since(start_time)}, batches {i}, current loss {loss:.4f}, average loss: {np.mean(loss_full):.4f}, latest average loss: {np.mean(loss_full[-256:]):.4f}"
            )

        if i % config.num_batches_validate == 0:
            (model if model_name != 'Exp1' else models[0]).eval()
            val_auc, val_mrr, val_ndcg5, val_ndcg10 = evaluate(
                model if model_name != 'Exp1' else models[0], './data/val',
                200000)
            (model if model_name != 'Exp1' else models[0]).train()
            writer.add_scalar('Validation/AUC', val_auc, step)
            writer.add_scalar('Validation/MRR', val_mrr, step)
            writer.add_scalar('Validation/nDCG@5', val_ndcg5, step)
            writer.add_scalar('Validation/nDCG@10', val_ndcg10, step)
            tqdm.write(
                f"Time {time_since(start_time)}, batches {i}, validation AUC: {val_auc:.4f}, validation MRR: {val_mrr:.4f}, validation nDCG@5: {val_ndcg5:.4f}, validation nDCG@10: {val_ndcg10:.4f}, "
            )

            early_stop, get_better = early_stopping(-val_auc)
            if early_stop:
                tqdm.write('Early stop.')
                break
            elif get_better:
                try:
                    torch.save(
                        {
                            'model_state_dict': (model if model_name != 'Exp1'
                                                 else models[0]).state_dict(),
                            'optimizer_state_dict':
                            (optimizer if model_name != 'Exp1' else
                             optimizers[0]).state_dict(),
                            'step':
                            step,
                            'early_stop_value':
                            -val_auc
                        }, f"./checkpoint/{model_name}/ckpt-{step}.pth")
                except OSError as error:
                    print(f"OS error: {error}")
Code Example #9
File: train.py Project: VSPW-dataset/VSPW_baseline
def main(gpu, cfg, args):
    # Network Builders

    load_gpu = gpu + args.start_gpu
    rank = gpu
    torch.cuda.set_device(load_gpu)
    dist.init_process_group(
        backend='nccl',
        init_method='tcp://127.0.0.1:{}'.format(args.port),
        world_size=args.gpu_num,
        rank=rank,
        timeout=datetime.timedelta(seconds=300))
            # self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model).cuda(self.gpu)


    if args.use_float16:
        from torch.cuda.amp import autocast as autocast, GradScaler
        scaler = GradScaler()
    else:
        scaler = None
        autocast = None

    label_num_ = args.num_class
    net_encoder = ModelBuilder.build_encoder(
        arch=cfg.MODEL.arch_encoder.lower(),
        fc_dim=cfg.MODEL.fc_dim,
        weights=cfg.MODEL.weights_encoder)
    net_decoder = ModelBuilder.build_decoder(
        arch=cfg.MODEL.arch_decoder.lower(),
        fc_dim=cfg.MODEL.fc_dim,
        num_class=label_num_,
        weights=cfg.MODEL.weights_decoder)

    crit = nn.NLLLoss(ignore_index=255)

    if cfg.MODEL.arch_decoder.endswith('deepsup'):
        segmentation_module = SegmentationModule(
            net_encoder, net_decoder, crit, cfg.TRAIN.deep_sup_scale)
    else:
        segmentation_module = SegmentationModule(
            net_encoder, net_decoder, crit)

    if args.use_clipdataset:
        dataset_train = BaseDataset_longclip(args,'train')
    else:
        dataset_train = BaseDataset(
            args,
            'train'
            )

    sampler_train = torch.utils.data.distributed.DistributedSampler(dataset_train)
    loader_train = torch.utils.data.DataLoader(dataset_train, batch_size=args.batchsize, shuffle=False,
                                               sampler=sampler_train, pin_memory=True,
                                               num_workers=args.workers)


    print('1 Epoch = {} iters'.format(cfg.TRAIN.epoch_iters))

    dataset_val = BaseDataset(
        args,
        'val'
        )
    sampler_val = torch.utils.data.distributed.DistributedSampler(dataset_val)
    loader_val = torch.utils.data.DataLoader(dataset_val, batch_size=args.batchsize, shuffle=False,
                                             sampler=sampler_val, pin_memory=True,
                                             num_workers=args.workers)
#    loader_val = torch.utils.data.DataLoader(dataset_val,batch_size=args.batchsize,shuffle=False,num_workers=args.workers)
    # create loader iterator
    

    # load nets into gpu

    segmentation_module = segmentation_module.cuda(load_gpu)

    segmentation_module= nn.SyncBatchNorm.convert_sync_batchnorm(segmentation_module)

    if args.resume_epoch != 0:
#        if dist.get_rank() == 0:
        to_load = torch.load(os.path.join('./resume', 'model_epoch_{}.pth'.format(args.resume_epoch)),
                             map_location=torch.device("cuda:" + str(load_gpu)))
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in to_load.items():
            name = k[7:]  # strip the leading 'module.' prefix (first seven characters) added by DataParallel
            new_state_dict[name] = v  # store each value under its stripped key
        cfg.TRAIN.start_epoch = args.resume_epoch
        segmentation_module.load_state_dict(new_state_dict)


    segmentation_module= torch.nn.parallel.DistributedDataParallel(
                    segmentation_module,
                device_ids=[load_gpu],
                find_unused_parameters=True)

    # Set up optimizers
#    nets = (net_encoder, net_decoder, crit)
    nets = segmentation_module
    optimizers = create_optimizers(segmentation_module, cfg)
    if args.resume_epoch != 0:
#        if dist.get_rank() == 0:
        optimizers.load_state_dict(
            torch.load(os.path.join('./resume', 'opt_epoch_{}.pth'.format(args.resume_epoch)),
                       map_location=torch.device("cuda:" + str(load_gpu))))
        print('resume from epoch {}'.format(args.resume_epoch))

    # Main loop
    history = {'train': {'epoch': [], 'loss': [], 'acc': []}}

#    test(segmentation_module,loader_val,args)
    for epoch in range(cfg.TRAIN.start_epoch, cfg.TRAIN.num_epoch):
        if dist.get_rank() == 0 and epoch == 0:
            checkpoint(nets, optimizers, history, args, epoch + 1)
        print('Epoch {}'.format(epoch))
        train(segmentation_module, loader_train, optimizers, history, epoch + 1,
              cfg, args, load_gpu, scaler=scaler, autocast=autocast)

        # checkpointing
        if dist.get_rank() == 0 and (epoch + 1) % 10 == 0:
            checkpoint(segmentation_module, optimizers, history, args, epoch + 1)
        if args.validation:
            test(segmentation_module, loader_val, args)

    print('Training Done!')
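
The resume block in this example strips the 'module.' prefix that DataParallel/DistributedDataParallel adds to parameter names before loading the state dict. A tiny self-contained illustration of that key rewrite; the checkpoint contents below are made up:

from collections import OrderedDict

# Keys as they might appear in a checkpoint saved from a DDP-wrapped model.
to_load = {'module.conv1.weight': 0, 'module.conv1.bias': 1}
new_state_dict = OrderedDict((k[7:], v) for k, v in to_load.items())
print(list(new_state_dict))  # ['conv1.weight', 'conv1.bias']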
Code Example #10
def train():
    writer = SummaryWriter(
        log_dir=
        f"./runs/{model_name}/{datetime.datetime.now().replace(microsecond=0).isoformat()}{'-' + os.environ['REMARK'] if 'REMARK' in os.environ else ''}"
    )

    if not os.path.exists('checkpoint'):
        os.makedirs('checkpoint')

    try:
        pretrained_word_embedding = torch.from_numpy(
            np.load('./data/train/pretrained_word_embedding.npy')).float()
    except FileNotFoundError:
        pretrained_word_embedding = None

    if model_name == 'DKN':
        try:
            pretrained_entity_embedding = torch.from_numpy(
                np.load(
                    './data/train/pretrained_entity_embedding.npy')).float()
        except FileNotFoundError:
            pretrained_entity_embedding = None

        try:
            pretrained_context_embedding = torch.from_numpy(
                np.load(
                    './data/train/pretrained_context_embedding.npy')).float()
        except FileNotFoundError:
            pretrained_context_embedding = None

        model = Model(config, pretrained_word_embedding,
                      pretrained_entity_embedding,
                      pretrained_context_embedding).to(device)
    elif model_name == 'Exp1':
        models = nn.ModuleList([
            Model(config, pretrained_word_embedding).to(device)
            for _ in range(config.ensemble_factor)
        ])
    elif model_name == 'Exp2':
        model = Model(config).to(device)
    else:
        model = Model(config, pretrained_word_embedding).to(device)

    if model_name != 'Exp1':
        print(model)
    else:
        print(models[0])

    dataset = BaseDataset('data/train/behaviors_parsed.tsv',
                          'data/train/news_parsed.tsv', 'data/train/roberta')

    print(f"Load training dataset with size {len(dataset)}.")

    dataloader = iter(
        DataLoader(dataset,
                   batch_size=config.batch_size,
                   shuffle=True,
                   num_workers=config.num_workers,
                   drop_last=True,
                   pin_memory=True))
    if model_name != 'Exp1':
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=config.learning_rate)
    else:
        criterion = nn.NLLLoss()
        optimizers = [
            torch.optim.Adam(model.parameters(), lr=config.learning_rate)
            for model in models
        ]
    start_time = time.time()
    loss_full = []
    exhaustion_count = 0
    step = 0
    early_stopping = EarlyStopping()

    checkpoint_dir = os.path.join('./checkpoint', model_name)
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)

    checkpoint_path = latest_checkpoint(checkpoint_dir)
    if checkpoint_path is not None:
        print(f"Load saved parameters in {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path)
        early_stopping(checkpoint['early_stop_value'])
        step = checkpoint['step']
        if model_name != 'Exp1':
            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            model.train()
        else:
            for model in models:
                model.load_state_dict(checkpoint['model_state_dict'])
                model.train()
            for optimizer in optimizers:
                optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    for i in tqdm(range(
            1,
            config.num_epochs * len(dataset) // config.batch_size + 1),
                  desc="Training"):
        try:
            minibatch = next(dataloader)
        except StopIteration:
            exhaustion_count += 1
            tqdm.write(
                f"Training data exhausted for {exhaustion_count} times after {i} batches, reuse the dataset."
            )
            dataloader = iter(
                DataLoader(dataset,
                           batch_size=config.batch_size,
                           shuffle=True,
                           num_workers=config.num_workers,
                           drop_last=True,
                           pin_memory=True))
            minibatch = next(dataloader)

        step += 1
        if model_name == 'LSTUR':
            y_pred = model(minibatch["user"], minibatch["clicked_news_length"],
                           minibatch["candidate_news"],
                           minibatch["clicked_news"])
        elif model_name == 'HiFiArk':
            y_pred, regularizer_loss = model(minibatch["candidate_news"],
                                             minibatch["clicked_news"])
        elif model_name == 'TANR':
            y_pred, topic_classification_loss = model(
                minibatch["candidate_news"], minibatch["clicked_news"])
        elif model_name == 'Exp1':
            y_preds = [
                model(minibatch["candidate_news"], minibatch["clicked_news"])
                for model in models
            ]
            y_pred_averaged = torch.stack(
                [F.softmax(y_pred, dim=1) for y_pred in y_preds],
                dim=-1).mean(dim=-1)
            y_pred = torch.log(y_pred_averaged)
        else:
            y_pred = model(minibatch["candidate_news"],
                           minibatch["clicked_news"])

        y = torch.zeros(len(y_pred)).long().to(device)
        loss = criterion(y_pred, y)

        if model_name == 'HiFiArk':
            if i % 10 == 0:
                writer.add_scalar('Train/BaseLoss', loss.item(), step)
                writer.add_scalar('Train/RegularizerLoss',
                                  regularizer_loss.item(), step)
                writer.add_scalar('Train/RegularizerBaseRatio',
                                  regularizer_loss.item() / loss.item(), step)
            loss += config.regularizer_loss_weight * regularizer_loss
        elif model_name == 'TANR':
            if i % 10 == 0:
                writer.add_scalar('Train/BaseLoss', loss.item(), step)
                writer.add_scalar('Train/TopicClassificationLoss',
                                  topic_classification_loss.item(), step)
                writer.add_scalar(
                    'Train/TopicBaseRatio',
                    topic_classification_loss.item() / loss.item(), step)
            loss += config.topic_classification_loss_weight * topic_classification_loss
        loss_full.append(loss.item())
        if model_name != 'Exp1':
            optimizer.zero_grad()
        else:
            for optimizer in optimizers:
                optimizer.zero_grad()
        loss.backward()
        if model_name != 'Exp1':
            optimizer.step()
        else:
            for optimizer in optimizers:
                optimizer.step()

        if i % 10 == 0:
            writer.add_scalar('Train/Loss', loss.item(), step)

        if i % config.num_batches_show_loss == 0:
            tqdm.write(
                f"Time {time_since(start_time)}, batches {i}, current loss {loss.item():.4f}, average loss: {np.mean(loss_full):.4f}, latest average loss: {np.mean(loss_full[-256:]):.4f}"
            )

        if i % config.num_batches_validate == 0:
            (model if model_name != 'Exp1' else models[0]).eval()
            val_auc, val_mrr, val_ndcg5, val_ndcg10 = evaluate(
                model if model_name != 'Exp1' else models[0], './data/val',
                200000)
            (model if model_name != 'Exp1' else models[0]).train()
            writer.add_scalar('Validation/AUC', val_auc, step)
            writer.add_scalar('Validation/MRR', val_mrr, step)
            writer.add_scalar('Validation/nDCG@5', val_ndcg5, step)
            writer.add_scalar('Validation/nDCG@10', val_ndcg10, step)
            tqdm.write(
                f"Time {time_since(start_time)}, batches {i}, validation AUC: {val_auc:.4f}, validation MRR: {val_mrr:.4f}, validation nDCG@5: {val_ndcg5:.4f}, validation nDCG@10: {val_ndcg10:.4f}, "
            )

            early_stop, get_better = early_stopping(-val_auc)
            if early_stop:
                tqdm.write('Early stop.')
                break
            elif get_better:
                try:
                    torch.save(
                        {
                            'model_state_dict': (model if model_name != 'Exp1'
                                                 else models[0]).state_dict(),
                            'optimizer_state_dict':
                            (optimizer if model_name != 'Exp1' else
                             optimizers[0]).state_dict(),
                            'step':
                            step,
                            'early_stop_value':
                            -val_auc
                        }, f"./checkpoint/{model_name}/ckpt-{step}.pth")
                except OSError as error:
                    print(f"OS error: {error}")
Code Example #11
def train(fed_num):
    VirtualWorker = []
    hook = sy.TorchHook(torch)  # <-- NEW: hook PyTorch ie add extra functionalities to support Federated Learning
    for i in range(fed_num):
        VirtualWorker.append(sy.VirtualWorker(hook, id=str(i)))
    VirtualWorker = tuple(VirtualWorker)
    secure_worker = sy.VirtualWorker(hook, id="secure_worker")
    #bob = sy.VirtualWorker(hook, id="bob")  # <-- NEW: define remote worker bob
    #alice = sy.VirtualWorker(hook, id="alice")  # <-- NEW: and alice
    #celine = sy.VirtualWorker(hook, id="celine")
    #david = sy.VirtualWorker(hook, id="david")
    #elsa = sy.VirtualWorker(hook, id="elsa")
    writer = SummaryWriter(
        log_dir=
        f"./runs/{model_name}/{datetime.datetime.now().replace(microsecond=0).isoformat()}{'-' + os.environ['REMARK'] if 'REMARK' in os.environ else ''}"
    )

    if not os.path.exists('checkpoint'):
        os.makedirs('checkpoint')

    try:
        pretrained_word_embedding = torch.from_numpy(
            np.load('./data/train/pretrained_word_embedding.npy')).float()
    except FileNotFoundError:
        pretrained_word_embedding = None

    model = Model(config, pretrained_word_embedding)
    print(model)

    dataset = BaseDataset('./data/train/behaviors_parsed.tsv',
                          './data/train/news_parsed.tsv', 
                          './data/train/roberta')

    print(f"Load training dataset with size {len(dataset)}.")
    ###############################################

    ###############################################
    # In the step we need to tranform the dataset in federated manner
    #print(dataset)
    dataloader = sy.FederatedDataLoader(dataset.federate(VirtualWorker),
                                            batch_size=config.batch_size,
                                            shuffle=True,
                                            num_workers=config.num_workers,
                                            drop_last=True,
                                            pin_memory=True
                                           )
    ###############################################
    print(f"The training dataset has been loaded!")
    
    #optimizer = torch.optim.SGD(model.parameters(),lr=config.learning_rate)
        
    start_time = time.time()
    loss_full = []
    exhaustion_count = 0
    step = 0
    early_stopping = EarlyStopping()

    checkpoint_dir = os.path.join('./checkpoint', model_name)
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)

    checkpoint_path = latest_checkpoint(checkpoint_dir)
    '''
    if checkpoint_path is not None:
        print(f"Load saved parameters in {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path)
        early_stopping(checkpoint['early_stop_value'])
        step = checkpoint['step']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        model.train()
    '''

    #for i in tqdm(range(1,config.num_epochs * len(dataset) // config.batch_size + 1),desc="Training"):
    for _ in range(config.num_epochs):
        models = []
        criterion = nn.CrossEntropyLoss()

        for i in range(fed_num):
            models.append(model.to(device).copy().send(str(i)))
            #criterions.append(nn.CrossEntropyLoss())
        optimizers = []
        for i in range(fed_num):
            optimizers.append(
                torch.optim.Adam(models[i].parameters(),
                                 lr=config.learning_rate)
            )

        for i, (minibatch, target) in enumerate(dataloader):
            step += 1
            minibatch, target = minibatch.to(device), target.to(device)
            location = minibatch.location
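            # In PySyft, minibatch is a pointer tensor; .location is the virtual
            # worker holding this batch, so only that worker's replica trains on it.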

            predicts = [0 for _ in range(fed_num)]
            losses = [0 for _ in range(fed_num)]
            for j in range(fed_num):
                if VirtualWorker[j] != location:
                    continue
                else:
                    optimizers[j].zero_grad()
                    predicts[j] = models[j](minibatch)
                    losses[j] = criterion(predicts[j], target)
                    losses[j].backward()
                    optimizers[j].step()
                    losses[j] = losses[j].get().cpu().item()

            print(losses)
            loss = np.sum(losses)
            loss_full.append(loss)


            if i % 10 == 0:
                writer.add_scalar('Train/Loss', loss, step)

            if i % config.num_batches_show_loss == 0:
                tqdm.write(
                    f"Time {time_since(start_time)}, batches {i}, current loss {loss:.4f}, average loss: {np.mean(loss_full):.4f}, latest average loss: {np.mean(loss_full[-256:]):.4f}"
                )

            if (i % config.num_batches_validate == 0) and (i != 0):
                with torch.no_grad():
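                    # Federated averaging: move each replica's parameters to the
                    # secure worker, average them element-wise, and load the result
                    # back into the central model before validating.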
                    paraDict = model.state_dict()
                    #model_temp = [0 for _ in range(fed_num)]
                    parasDict = []
                    for k in range(fed_num):
                        #model_temp[k] = models[k].copy().send(secure_worker)
                        models[k].move(secure_worker)
                        parasDict.append(models[k].state_dict())
                    for name in paraDict:
                        paraDict[name] = parasDict[0][name].clone().get()
                        for index in range(1, fed_num):
                            paraDict[name] += parasDict[index][name].clone().get()
                        paraDict[name] /= fed_num
                model.load_state_dict(paraDict)
                #model = model.to(device)
                models = []
                for index in range(fed_num):
                    models.append(model.to(device).copy().send(str(index)))
                model.eval()
                val_auc, val_mrr, val_ndcg5, val_ndcg10 = evaluate(
                    model, './data/val',
                    200000)
                model.train()
                writer.add_scalar('Validation/AUC', val_auc, step)
                writer.add_scalar('Validation/MRR', val_mrr, step)
                writer.add_scalar('Validation/nDCG@5', val_ndcg5, step)
                writer.add_scalar('Validation/nDCG@10', val_ndcg10, step)
                tqdm.write(
                    f"Time {time_since(start_time)}, batches {i}, validation AUC: {val_auc:.4f}, validation MRR: {val_mrr:.4f}, validation nDCG@5: {val_ndcg5:.4f}, validation nDCG@10: {val_ndcg10:.4f}, "
                )

                early_stop, get_better = early_stopping(-val_auc)
                if early_stop:
                    tqdm.write('Early stop.')
                    break
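
The training loop above (and the one in code example #12 below) relies on an EarlyStopping helper whose implementation is not shown. The following is a minimal sketch consistent with the call sites: the class name and the (early_stop, get_better) return convention come from the usage above, while the patience value and the internals are assumptions.

class EarlyStopping:
    """Signals a stop after `patience` validations without improvement."""

    def __init__(self, patience=5):  # patience=5 is an assumed default
        self.patience = patience
        self.counter = 0
        self.best_value = float('inf')

    def __call__(self, value):
        # Callers pass -val_auc, so smaller is better (loss-like convention).
        if value < self.best_value:
            self.best_value = value
            self.counter = 0
            return False, True   # (early_stop, get_better)
        self.counter += 1
        return self.counter >= self.patience, False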
Code example #12
0
def train():
    writer = SummaryWriter(
        log_dir=
        f"./runs/{model_name}/{datetime.datetime.now().replace(microsecond=0).isoformat()}-{Config.num_batches_classification:04}-{Config.joint_loss}"
    )

    try:
        pretrained_word_embedding = torch.from_numpy(
            np.load('./data/train/pretrained_word_embedding.npy')).float()
    except FileNotFoundError:
        pretrained_word_embedding = None

    model = Model(Config, pretrained_word_embedding, writer).to(device)

    print(model)

    dataset = BaseDataset('data/train/behaviors_parsed.tsv',
                          'data/train/news_parsed.tsv',
                          Config.dataset_attributes)

    print(f"Load training dataset with size {len(dataset)}.")

    dataloader = iter(
        DataLoader(dataset,
                   batch_size=Config.batch_size,
                   shuffle=True,
                   num_workers=Config.num_workers,
                   drop_last=True))

    checkpoint_dir = os.path.join('./checkpoint', model_name)
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)

    if Config.num_batches_classification != 0:
        step_classification = 0
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=Config.learning_rate)
        start_time = time.time()
        with tqdm(total=Config.num_batches_classification,
                  desc="Training (classification)") as pbar:
            for i in range(1, Config.num_batches_classification + 1):
                try:
                    minibatch = next(dataloader)
                except StopIteration:
                    dataloader = iter(
                        DataLoader(dataset,
                                   batch_size=Config.batch_size,
                                   shuffle=True,
                                   num_workers=Config.num_workers,
                                   drop_last=True))
                    minibatch = next(dataloader)

                step_classification += 1

                _, topic_classification_loss = model(
                    minibatch["candidate_news"],
                    minibatch["clicked_news"],
                    classification_only=True)
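                # classification_only=True: this warm-up stage optimizes only the
                # topic classification loss of the news encoder, before the
                # click-prediction loop below.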

                writer.add_scalar('Train(classification)/Loss',
                                  topic_classification_loss.item(),
                                  step_classification)
                optimizer.zero_grad()
                topic_classification_loss.backward()
                optimizer.step()

                if i % Config.num_batches_show_loss == 0:
                    tqdm.write(
                        f"Time {time_since(start_time)}, batches {i}, current loss {topic_classification_loss.item():.4f}"
                    )

                pbar.update(1)

    loss_full = []
    exhaustion_count = 0
    step = 0
    optimizer = torch.optim.Adam(model.parameters(), lr=Config.learning_rate)
    start_time = time.time()

    early_stopping = EarlyStopping()

    with tqdm(total=Config.num_batches, desc="Training") as pbar:
        for i in range(1, Config.num_batches + 1):
            try:
                minibatch = next(dataloader)
            except StopIteration:
                exhaustion_count += 1
                tqdm.write(
                    f"Training data exhausted for {exhaustion_count} times after {i} batches, reuse the dataset."
                )
                dataloader = iter(
                    DataLoader(dataset,
                               batch_size=Config.batch_size,
                               shuffle=True,
                               num_workers=Config.num_workers,
                               drop_last=True))
                minibatch = next(dataloader)

            step += 1

            y_pred, topic_classification_loss = model(
                minibatch["candidate_news"], minibatch["clicked_news"])
            loss = torch.stack(
                [x[0] for x in -F.log_softmax(y_pred, dim=1)]).mean()
            if Config.joint_loss:
                writer.add_scalar('Train/BaseLoss', loss.item(), step)
                writer.add_scalar('Train/TopicClassificationLoss',
                                  topic_classification_loss.item(), step)
                writer.add_scalar(
                    'Train/TopicBaseRatio',
                    topic_classification_loss.item() / loss.item(), step)
                loss += Config.topic_classification_loss_weight * topic_classification_loss
            loss_full.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            writer.add_scalar('Train/Loss', loss.item(), step)

            if i % Config.num_batches_show_loss == 0:
                tqdm.write(
                    f"Time {time_since(start_time)}, batches {i}, current loss {loss.item():.4f}, average loss: {np.mean(loss_full):.4f}"
                )

            if i % Config.num_batches_validate == 0:
                val_auc, val_mrr, val_ndcg5, val_ndcg10 = evaluate(
                    model, './data/val')
                writer.add_scalar('Validation/AUC', val_auc, step)
                writer.add_scalar('Validation/MRR', val_mrr, step)
                writer.add_scalar('Validation/nDCG@5', val_ndcg5, step)
                writer.add_scalar('Validation/nDCG@10', val_ndcg10, step)
                tqdm.write(
                    f"Time {time_since(start_time)}, batches {i}, validation AUC: {val_auc:.4f}, validation MRR: {val_mrr:.4f}, validation nDCG@5: {val_ndcg5:.4f}, validation nDCG@10: {val_ndcg10:.4f}, "
                )

                early_stop, get_better = early_stopping(-val_auc)
                if early_stop:
                    tqdm.write('Early stop.')
                    break
                elif get_better:
                    torch.save({'model_state_dict': model.state_dict()},
                               f"./checkpoint/{model_name}/ckpt-{step}.pth")

            pbar.update(1)
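
The progress messages above format elapsed time with a time_since helper that is not part of the snippet. A minimal sketch, assuming it simply reports the wall-clock time elapsed since start_time as HH:MM:SS:

import time

def time_since(base):
    # base is a time.time() timestamp; return the elapsed time formatted as HH:MM:SS.
    seconds = int(time.time() - base)
    return f"{seconds // 3600:02d}:{seconds % 3600 // 60:02d}:{seconds % 60:02d}"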
Code example #13
0
    def __init__(self, opt):
        self.opt = opt
        print(opt.dir + "/data.txt")
        assert os.path.exists(
            opt.dir + "/data.txt"), "No data.txt found in specified dir"
        assert os.path.exists(
            opt.dir + "/label.txt"), "No label.txt found in specified dir"

        train_dir = opt.data_dir + "/TrainSet/"
        val_dir = opt.data_dir + "/ValidateSet/"
        test_dir = opt.data_dir + "/TestSet/"

        # split data
        if not all([
                os.path.exists(train_dir),
                os.path.exists(val_dir),
                os.path.exists(test_dir)
        ]):
            # rm existing directories
            rmdir(train_dir)
            rmdir(val_dir)
            rmdir(test_dir)

            # split data to Train, Val, Test
            logging.info("Split raw data to Train, Val and Test")
            ratios = opt.ratio
            dataset = collections.defaultdict(list)
            with open(opt.dir + '/data.txt') as d:
                for line in d.readlines():
                    line = json.loads(line)
                    # if this record already has a data_type assigned, keep its previous split
                    if "type" in line:
                        dataset[line["type"]].append(line)
                        continue
                    # otherwise assign a data_type randomly according to the ratios
                    rand = random.random()
                    if rand < ratios[0]:
                        data_type = "Train"
                    elif rand < ratios[0] + ratios[1]:
                        data_type = "Validate"
                    else:
                        data_type = "Test"
                    dataset[data_type].append(line)
            # write to file
            self._WriteDataToFile(dataset["Train"], train_dir)
            self._WriteDataToFile(dataset["Validate"], val_dir)
            self._WriteDataToFile(dataset["Test"], test_dir)

        self.rid2name, self.id2rid, self.rid2id = load_label(opt.dir +
                                                             '/label.txt')
        self.num_classes = [len(item) - 2 for item in self.rid2name]

        # load dataset
        if opt.mode == "Train":
            logging.info("Load Train Dataset...")
            self.train_set = BaseDataset(self.opt, "TrainSet", self.rid2id)
            logging.info("Load Validate Dataset...")
            self.val_set = BaseDataset(self.opt, "ValidateSet", self.rid2id)
        else:
            # force batch_size for test to 1
            self.opt.batch_size = 1
            self.opt.load_thread = 1
            logging.info("Load Test Dataset...")
            self.test_set = BaseDataset(self.opt, "TestSet", self.rid2id)
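
The split step in this example calls rmdir and _WriteDataToFile, which are not shown. The sketch below is an assumption that mirrors the read side (one JSON record per line in data.txt); it is written as free functions here, whereas _WriteDataToFile is a method on the class above, and the real project may store the splits differently.

import json
import os
import shutil

def rmdir(path):
    # Remove a directory tree if it exists; silently ignore missing paths.
    if os.path.isdir(path):
        shutil.rmtree(path)

def _WriteDataToFile(records, target_dir):
    # Recreate the split directory and write one JSON record per line,
    # matching the data.txt format that the constructor reads back.
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, 'data.txt'), 'w') as f:
        for record in records:
            f.write(json.dumps(record) + '\n')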
Code example #14
0
File: settings.py Project: yaroslav-dudar/fox.cub
BR_SERIE_B = DatasetAggregator(ObservationDataset('leagues/br_serie_b.json'))
BR_SERIE_A = DatasetAggregator(ObservationDataset('leagues/br_serie_a.json'))
BAHRAIN_PL = DatasetAggregator(ObservationDataset('leagues/bahrain_pl.json'))
BEL_PL = DatasetAggregator(ObservationDataset('leagues/bel_pl.json'))
K_LEAGUE = DatasetAggregator(ObservationDataset('leagues/k_league.json'))
K_LEAGUE2 = DatasetAggregator(ObservationDataset('leagues/k_league2.json'))
COSTA_RICA = DatasetAggregator(ObservationDataset('leagues/costa_rica_primera.json'))
NB_1_LIGA = DatasetAggregator(ObservationDataset('leagues/nb_1_liga.json'))
Eliteserien = DatasetAggregator(ObservationDataset('leagues/eliteserien.json'))
Allsvenskan = DatasetAggregator(ObservationDataset('leagues/allsvenskan.json'))
CHINA_SUPER_LEAGUE = DatasetAggregator(ObservationDataset('leagues/china_super_league.json'))

FA_CUP = DatasetAggregator(
    ObservationDataset('cups/fa_cup.json'),
    FeatureDataset([
        BaseDataset.from_file('leagues/epl.json', {'strength': 0}),
        BaseDataset.from_file('leagues/efl_championship.json', {'strength': 1}),
        BaseDataset.from_file('leagues/efl_league1.json', {'strength': 2}),
        BaseDataset.from_file('leagues/efl_league2.json', {'strength': 3})
        ])
)

LEAGUE_CUP = DatasetAggregator(
    ObservationDataset('cups/league_cup.json'),
    FeatureDataset([
        BaseDataset.from_file('leagues/epl.json', {'strength': 0}),
        BaseDataset.from_file('leagues/efl_championship.json', {'strength': 1}),
        BaseDataset.from_file('leagues/efl_league1.json', {'strength': 2}),
        BaseDataset.from_file('leagues/efl_league2.json', {'strength': 3})
        ])
)
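
FA_CUP and LEAGUE_CUP above repeat the same four-league feature set. Purely as an illustration (this helper is not part of the fox.cub settings module and relies on the same imports used above), the pattern could be factored out like this:

# Hypothetical helper, not in the original settings.py.
_ENGLISH_PYRAMID = [
    ('leagues/epl.json', 0),
    ('leagues/efl_championship.json', 1),
    ('leagues/efl_league1.json', 2),
    ('leagues/efl_league2.json', 3),
]

def english_cup(observation_file):
    # Build a cup aggregator whose features come from the four English league tiers.
    return DatasetAggregator(
        ObservationDataset(observation_file),
        FeatureDataset([BaseDataset.from_file(path, {'strength': strength})
                        for path, strength in _ENGLISH_PYRAMID]))

# Equivalent to the definitions above:
# FA_CUP = english_cup('cups/fa_cup.json')
# LEAGUE_CUP = english_cup('cups/league_cup.json')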