Example #1
    def __setup_model_hparams(self):

        # 1. define losses
        self.loss = nn.MSELoss()
        self.loss_adv = nn.BCELoss()

        # 2. define model metric
        self.metric = Kappa()

        # 3. define optimizer
        self.optimizer = eval(f"torch.optim.{self.hparams['optimizer_name']}")(
            params=self.model.parameters(),
            **self.hparams['optimizer_hparams'])

        # 4. define scheduler
        self.scheduler = eval(
            f"torch.optim.lr_scheduler.{self.hparams['scheduler_name']}")(
                optimizer=self.optimizer, **self.hparams['scheduler_hparams'])

        # 5. define early stopping
        self.early_stopping = EarlyStopping(
            checkpoint_path=self.hparams['checkpoint_path'] +
            f'/checkpoint_{self.start_training}' + '.pt',
            patience=self.hparams['patience'],
            delta=self.hparams['min_delta'],
            is_maximize=True,
        )

        # 6. set gradient clipping
        self.apply_clipping = self.hparams['clipping']  # clipping of gradients

        # 7. Set scaler for optimizer
        self.scaler = torch.cuda.amp.GradScaler()

        return True
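
All of the snippets in this collection assume an EarlyStopping helper that tracks a monitored score, checkpoints the model whenever the score improves, and raises an early_stop flag after patience epochs without improvement. Below is a minimal sketch of such a helper matching the constructor arguments used in Examples 1 and 2 (checkpoint_path, patience, delta, is_maximize); call signatures vary across the other examples (some pass an output directory or a monitor name), so treat this as an illustration of the pattern rather than any project's actual utility.

import torch


class EarlyStopping:
    """Stop training when the monitored score stops improving (sketch)."""

    def __init__(self, checkpoint_path='checkpoint.pt', patience=7, delta=0.0,
                 is_maximize=True, verbose=False):
        self.checkpoint_path = checkpoint_path
        self.patience = patience
        self.delta = delta
        self.is_maximize = is_maximize
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, score, model):
        # Work on a "higher is better" scale so one comparison handles both modes.
        current = score if self.is_maximize else -score
        if self.best_score is None or current > self.best_score + self.delta:
            self.best_score = current
            torch.save(model.state_dict(), self.checkpoint_path)  # checkpoint on improvement
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
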
Example #2
    def __setup_model_hparams(self):

        # 1. define losses
        self.loss = SimCLR_2(
            temperature=100
        )  # SimclrCriterion(batch_size=self.hparams['batch_size'],device=self.device)

        # 2. define optimizer
        self.optimizer = eval(f"torch.optim.{self.hparams['optimizer_name']}")(
            params=self.model.parameters(),
            **self.hparams['optimizer_hparams'])

        # 3. define scheduler
        self.scheduler = eval(
            f"torch.optim.lr_scheduler.{self.hparams['scheduler_name']}")(
                optimizer=self.optimizer, **self.hparams['scheduler_hparams'])

        # 4. define early stopping
        self.early_stopping = EarlyStopping(
            checkpoint_path=self.hparams['checkpoint_path'] +
            f'/checkpoint_{self.start_training}' + '.pt',
            patience=self.hparams['patience'],
            delta=self.hparams['min_delta'],
            is_maximize=False,
        )

        # 5. set gradient clipping
        self.apply_clipping = self.hparams['clipping']  # clipping of gradients

        # 6. Set scaler for optimizer
        self.scaler = torch.cuda.amp.GradScaler()

        return True
Example #3
def main(opt):
    """
    train_dataset = BADataset(opt.dataroot, opt.L, True, False, False)
    train_dataloader = BADataloader(train_dataset, batch_size=opt.batchSize, \
                                      shuffle=True, num_workers=opt.workers, drop_last=True)

    valid_dataset = BADataset(opt.dataroot, opt.L, False, True, False)
    valid_dataloader = BADataloader(valid_dataset, batch_size=opt.batchSize, \
                                     shuffle=True, num_workers=opt.workers, drop_last=True)

    test_dataset = BADataset(opt.dataroot, opt.L, False, False, True)
    test_dataloader = BADataloader(test_dataset, batch_size=opt.batchSize, \
                                     shuffle=True, num_workers=opt.workers, drop_last=True)
    """
    all_dataset = BADataset(opt.dataroot, opt.L, False, False, False)
    all_dataloader = BADataloader(all_dataset, batch_size=opt.batchSize, \
                                     shuffle=False, num_workers=opt.workers, drop_last=False)

    net = COSSIMMLP(opt)
    net.double()
    print(net)

    criterion = nn.BCELoss()

    if opt.cuda:
        net.cuda()
        criterion.cuda()

    #optimizer = optim.Adam(net.parameters(), lr=opt.lr)
    optimizer = ""
    early_stopping = EarlyStopping(patience=opt.patience, verbose=True)

    os.makedirs(OutputDir, exist_ok=True)
    train_loss_ls = []
    valid_loss_ls = []
    test_loss_ls = []

    for epoch in range(0, opt.niter):
        # train_loss = train(epoch, train_dataloader, net, criterion, optimizer, opt)
        # valid_loss = valid(valid_dataloader, net, criterion, opt)
        # test_loss = test(test_dataloader, net, criterion, opt)
        train_loss = 0
        valid_loss = 0
        test_loss = 0

        train_loss_ls.append(train_loss)
        valid_loss_ls.append(valid_loss)
        test_loss_ls.append(test_loss)

        early_stopping(valid_loss, net, OutputDir)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    df = pd.DataFrame({'epoch':[i for i in range(1, len(train_loss_ls)+1)], 'train_loss': train_loss_ls, 'valid_loss': valid_loss_ls, 'test_loss': test_loss_ls})
    df.to_csv(OutputDir + '/loss.csv', index=False)

    net.load_state_dict(torch.load(OutputDir + '/checkpoint.pt'))
    inference(all_dataloader, net, criterion, opt, OutputDir)
Example #4
def main(opt):
    train_dataset = BADataset(opt.dataroot, opt.L, True, False, False)
    train_dataloader = BADataloader(train_dataset, batch_size=opt.batchSize, \
                                      shuffle=True, num_workers=opt.workers, drop_last=True)

    valid_dataset = BADataset(opt.dataroot, opt.L, False, True, False)
    valid_dataloader = BADataloader(valid_dataset, batch_size=opt.batchSize, \
                                     shuffle=True, num_workers=opt.workers, drop_last=True)

    test_dataset = BADataset(opt.dataroot, opt.L, False, False, True)
    test_dataloader = BADataloader(test_dataset, batch_size=opt.batchSize, \
                                     shuffle=True, num_workers=opt.workers, drop_last=True)

    all_dataset = BADataset(opt.dataroot, opt.L, False, False, False)
    all_dataloader = BADataloader(all_dataset, batch_size=opt.batchSize, \
                                     shuffle=False, num_workers=opt.workers, drop_last=False)

    opt.n_edge_types = train_dataset.n_edge_types
    opt.n_node = train_dataset.n_node

    net = STGGNN(opt, kernel_size=2, n_blocks=1, state_dim_bottleneck=opt.state_dim, annotation_dim_bottleneck=opt.annotation_dim)
    net.double()
    print(net)

    criterion = nn.BCELoss()

    if opt.cuda:
        net.cuda()
        criterion.cuda()

    optimizer = optim.Adam(net.parameters(), lr=opt.lr)
    early_stopping = EarlyStopping(patience=opt.patience, verbose=True)

    os.makedirs(OutputDir, exist_ok=True)
    train_loss_ls = []
    valid_loss_ls = []
    test_loss_ls = []

    #net.load_state_dict(torch.load(OutputDir + '/checkpoint_5083.pt'))

    for epoch in range(0, opt.niter):
        train_loss = train(epoch, train_dataloader, net, criterion, optimizer, opt)
        valid_loss = valid(valid_dataloader, net, criterion, opt)
        test_loss = test(test_dataloader, net, criterion, opt)

        train_loss_ls.append(train_loss)
        valid_loss_ls.append(valid_loss)
        test_loss_ls.append(test_loss)

        early_stopping(valid_loss, net, OutputDir)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    df = pd.DataFrame({'epoch':[i for i in range(1, len(train_loss_ls)+1)], 'train_loss': train_loss_ls, 'valid_loss': valid_loss_ls, 'test_loss': test_loss_ls})
    df.to_csv(OutputDir + '/loss.csv', index=False)

    net.load_state_dict(torch.load(OutputDir + '/checkpoint.pt'))
    inference(all_dataloader, net, criterion, opt, OutputDir)
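
Examples 3 and 4 drive per-epoch train(), valid() and test() helpers that are not shown in this collection. The following is a hedged sketch of what such helpers might look like for a loss-only training loop; the batch layout (inputs, targets), the mean-loss return convention, and the signatures are assumptions for illustration, not the project's real code.

import torch


def train(epoch, dataloader, net, criterion, optimizer, opt):
    # One training epoch; epoch is accepted only to mirror the call in Example 4.
    net.train()
    total_loss = 0.0
    for inputs, targets in dataloader:
        if opt.cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        loss = criterion(net(inputs), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / max(len(dataloader), 1)


def valid(dataloader, net, criterion, opt):
    # One validation pass without gradient tracking.
    net.eval()
    total_loss = 0.0
    with torch.no_grad():
        for inputs, targets in dataloader:
            if opt.cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            total_loss += criterion(net(inputs), targets).item()
    return total_loss / max(len(dataloader), 1)
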
Example #5
def main(opt):
    train_dataset = BADataset(opt.dataroot, opt.L, True, False, False)
    train_dataloader = BADataloader(train_dataset, batch_size=opt.batchSize, \
                                      shuffle=True, num_workers=opt.workers, drop_last=True)

    valid_dataset = BADataset(opt.dataroot, opt.L, False, True, False)
    valid_dataloader = BADataloader(valid_dataset, batch_size=opt.batchSize, \
                                     shuffle=True, num_workers=opt.workers, drop_last=True)

    test_dataset = BADataset(opt.dataroot, opt.L, False, False, True)
    test_dataloader = BADataloader(test_dataset, batch_size=opt.batchSize, \
                                     shuffle=True, num_workers=opt.workers, drop_last=True)

    all_dataset = BADataset(opt.dataroot, opt.L, False, False, False)
    all_dataloader = BADataloader(all_dataset, batch_size=opt.batchSize, \
                                     shuffle=False, num_workers=opt.workers, drop_last=False)

    opt.n_edge_types = train_dataset.n_edge_types
    opt.n_node = train_dataset.n_node

    net = EGCN(gcn_args, activation = torch.nn.RReLU(), device = opt.device)
    print(net)

    criterion = nn.MSELoss()
    #criterion = nn.CosineSimilarity(dim=-1, eps=1e-6)

    if opt.cuda:
        net.cuda()
        criterion.cuda()

    optimizer = optim.Adam(net.parameters(), lr=opt.lr)
    early_stopping = EarlyStopping(patience=opt.patience, verbose=True)

    os.makedirs(OutputDir, exist_ok=True)
    train_loss_ls = []
    valid_loss_ls = []
    test_loss_ls = []

    for epoch in range(0, opt.niter):
        train_loss = train(epoch, train_dataloader, net, criterion, optimizer, opt)
        valid_loss = valid(valid_dataloader, net, criterion, opt)
        test_loss = test(test_dataloader, net, criterion, opt)

        train_loss_ls.append(train_loss)
        valid_loss_ls.append(valid_loss)
        test_loss_ls.append(test_loss)

        early_stopping(valid_loss, net, OutputDir)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    df = pd.DataFrame({'epoch':[i for i in range(1, len(train_loss_ls)+1)], 'train_loss': train_loss_ls, 'valid_loss': valid_loss_ls, 'test_loss': test_loss_ls})
    df.to_csv(OutputDir + '/loss.csv', index=False)

    #net.load_state_dict(torch.load(OutputDir + '/checkpoint.pt'))
    net = torch.load(OutputDir + '/checkpoint.pt')
    inference(all_dataloader, net, criterion, opt, OutputDir)
Example #6
 def __init__(self):
     self.patience = 7
     self.warm_up = 6
     self.patience_decay = {1: 0.8, 2: 0.5, 3: 0}
     self.early_stopping = EarlyStopping(patience=config.patience,
                                         verbose=True)
     self.decay = 0
     self.stop = False
     self.lr_decay_dict = [0.7, 0.9]
Example #7
    def __init__(self, input_size, n_channels, hparams):

        self.hparams = hparams

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # define the models
        self.model = WaveNet(n_channels=n_channels).to(self.device)
        summary(self.model, (input_size, n_channels))
        # self.model.half()

        if torch.cuda.device_count() > 1:
            print("Number of GPUs will be used: ",
                  torch.cuda.device_count() - 3)
            self.model = DP(self.model,
                            device_ids=list(
                                range(torch.cuda.device_count() - 3)))
        else:
            print('Only one GPU is available')

        self.metric = Metric()
        self.num_workers = 1
        ########################## compile the model ###############################

        # define optimizer
        self.optimizer = torch.optim.Adam(params=self.model.parameters(),
                                          lr=self.hparams['lr'],
                                          weight_decay=1e-5)

        # weights = torch.Tensor([0.025,0.033,0.039,0.046,0.069,0.107,0.189,0.134,0.145,0.262,1]).cuda()
        self.loss = nn.BCELoss()  # CompLoss(self.device)

        # define early stopping
        self.early_stopping = EarlyStopping(
            checkpoint_path=self.hparams['checkpoint_path'] + '/checkpoint.pt',
            patience=self.hparams['patience'],
            delta=self.hparams['min_delta'],
        )
        # lr scheduler
        self.scheduler = ReduceLROnPlateau(
            optimizer=self.optimizer,
            mode='max',
            factor=0.2,
            patience=3,
            verbose=True,
            threshold=self.hparams['min_delta'],
            threshold_mode='abs',
            cooldown=0,
            eps=0,
        )

        self.seed_everything(42)
        self.threshold = 0.75
        self.scaler = torch.cuda.amp.GradScaler()
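
Examples 1, 2 and 7 create a torch.cuda.amp.GradScaler and a gradient-clipping flag but do not show the training step that uses them. A minimal sketch of such a step follows; the function name, the fixed max_norm value, and the way the batch is unpacked are assumptions for illustration only.

import torch


def amp_train_step(model, batch, target, loss_fn, optimizer, scaler, apply_clipping=False):
    # Mixed-precision forward/backward pass driven by a GradScaler.
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        output = model(batch)
        loss = loss_fn(output, target)
    scaler.scale(loss).backward()
    if apply_clipping:
        # Unscale first so the clipping threshold applies to the true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    scaler.step(optimizer)
    scaler.update()
    return loss.item()
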
Example #8
def main():

    #using kfold
    #train_dataset = GraphemeDataSet(TRAIN_DATA_DIR, TRAIN_DATA_CSV, TRAINING_FOLDS)
    #valid_dataset = GraphemeDataSet(TRAIN_DATA_DIR, TRAIN_DATA_CSV, VALIDATION_FOLDS)

    #using one split : train and validation
    train_dataset = GraphemeDataSet(TRAIN_DATA_DIR,
                                    TRAIN_DATA_CSV,
                                    is_train=True)
    valid_dataset = GraphemeDataSet(TRAIN_DATA_DIR,
                                    TRAIN_DATA_CSV,
                                    is_train=False)

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=TRAIN_BATCH_SIZE,
                              num_workers=4)

    valid_loader = DataLoader(dataset=valid_dataset,
                              batch_size=TRAIN_BATCH_SIZE,
                              num_workers=4)

    model = MODEL_DISPATCHER[BASE_MODEL](pretrained=True)
    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters())
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.3,
                                                           patience=5,
                                                           verbose=True)
    early_stopping = EarlyStopping(patience=7, verbose=True)

    for e in range(EPOCHS):
        print("Epoch {} : ".format(e))
        train(train_loader, model, optimizer, e)
        val_score = evaluate(valid_loader, model, e)
        scheduler.step(val_score)
        early_stopping(val_score, model)
        if early_stopping.early_stop:
            print("Early stopping!")
            break
Example #9
File: ops.py Project: Sdhir/deepCIN
def train_model(model_num,
                model,
                dataloaders,
                num_tr_samples,
                criterion,
                optimizer,
                hist_dir,
                early_stop_patience,
                writer,
                num_epochs=25,
                with_cuda=True):
    since = time.time()

    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = math.inf
    best_val_acc = 0
    best_epoch = 0
    all_f_ = torch.tensor([])
    all_l_ = torch.tensor([])

    hist_dir = os.path.join(hist_dir, 'model_' + str(model_num))
    if not os.path.exists(hist_dir):
        os.mkdir(hist_dir)

    csv = open(os.path.join(hist_dir, 'eval_history.csv'), 'w')
    csv.write("epoch,train_loss,train_acc,val_loss,val_acc\n")

    monitor = 'val_acc'
    early_stopping = EarlyStopping(patience=early_stop_patience,
                                   monitor=monitor,
                                   verbose=False)

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 12)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            print(phase, '|', end='')
            print(" Dataloader len: ", len(dataloaders[phase].dataset))
            # Iterate over data.
            for i, (inputs, labels) in enumerate(dataloaders[phase]):
                #print(' ',i+1,') vs',inputs.size(1), end = '')
                if torch.cuda.is_available() and with_cuda:
                    inputs = inputs.to('cuda')
                    labels = labels.to('cuda')
                # zero the parameter gradients
                optimizer.zero_grad()
                model._init_hidden_state(last_batch_size=inputs.size(0))

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    # Special case for inception because in training it has an auxiliary output. In train
                    #   mode we calculate the loss by summing the final output and the auxiliary output
                    #   but in testing we only consider the final output.
                    outputs, features, _ = model(inputs)
                    loss = criterion(outputs, labels)
                    #loss = torch.sum(loss)
                    loss = li_regularizer(model, loss)

                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                    else:
                        if not i:
                            all_f = features
                            all_l = labels
                        else:
                            all_f = torch.cat((all_f, features), dim=0)
                            all_l = torch.cat((all_l, labels), dim=0)

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            if phase == 'train':
                epoch_loss = running_loss / num_tr_samples
                epoch_acc = running_corrects.double() / num_tr_samples
            else:
                epoch_loss = running_loss / len(dataloaders[phase].dataset)
                epoch_acc = running_corrects.double() / len(
                    dataloaders[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))

            if phase == 'train':
                tr_loss = epoch_loss
                tr_acc = epoch_acc.cpu().data.numpy()
                writer.add_scalar('loss/train', epoch_loss, epoch)
                writer.add_scalar('acc/train', epoch_acc, epoch)
            else:
                val_loss = epoch_loss
                val_acc = epoch_acc.cpu().data.numpy()
                writer.add_scalar('loss/test', epoch_loss, epoch)
                writer.add_scalar('acc/test', epoch_acc, epoch)

            # deep copy the model
            if phase == 'val':
                if monitor == 'val_acc' and val_acc > best_val_acc:  #epoch_loss < best_loss:
                    best_loss = epoch_loss
                    best_epoch = epoch
                    best_model_wts = copy.deepcopy(model.state_dict())
                    best_val_acc = val_acc
                    all_f_ = all_f
                    all_l_ = all_l

                elif monitor == 'val_loss' and epoch_loss < best_loss:
                    best_loss = epoch_loss
                    best_epoch = epoch
                    best_model_wts = copy.deepcopy(model.state_dict())
                    best_val_acc = val_acc
                    all_f_ = all_f
                    all_l_ = all_l

        print()
        csv.write(
            str(epoch) + ',' + str(tr_loss) + ',' + str(tr_acc) + ',' +
            str(val_loss) + ',' + str(val_acc) + '\n')
        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        if monitor == 'val_acc':
            early_stopping(val_acc, model, hist_dir)
        else:
            early_stopping(val_loss, model, hist_dir)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    time_elapsed = time.time() - since
    logging.info('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    #print('Best val Acc: {:4f}'.format(best_acc))

    # save best predictions and respective labels
    #np.savez(os.path.join(hist_dir,'model_'+str(model_num)+'.npz'),
    #        pred=best_pred_out.cpu().data.numpy(),gt=best_gt.cpu().data.numpy())
    # load best model weights
    logging.info(
        "best model epoch: {}, val_loss: {:.4f}, val_acc:{:.4f}".format(
            best_epoch, best_loss, best_val_acc))
    check_pt = "wt_best_ep{}_loss_{:.3f}_acc{:.3f}.pth".format(
        best_epoch, best_loss, best_val_acc)
    # save model
    model.load_state_dict(best_model_wts)

    # Best model predictions on the validation data
    #writer.add_figure('predictions_vs._actuals', plot_classes_preds(model, inputs, labels, classes), global_step=model_num)

    torch.save(model.state_dict(), os.path.join(hist_dir, check_pt))
    return model, best_val_acc, best_loss, best_epoch, all_f_, all_l_
Example #10
def train_model(model_num,
                model,
                data,
                criterion,
                optimizer,
                hist_dir,
                early_stop_patience,
                num_epochs=25,
                with_cuda=True):
    since = time.time()

    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = math.inf
    best_val_acc = 0
    best_epoch = 0

    hist_dir = os.path.join(hist_dir, 'model_' + str(model_num))
    if not os.path.exists(hist_dir):
        os.mkdir(hist_dir)

    csv = open(os.path.join(hist_dir, 'eval_history.csv'), 'w')
    csv.write("epoch,train_loss,train_acc,val_loss,val_acc\n")

    early_stopping = EarlyStopping(patience=early_stop_patience,
                                   monitor='val_acc',
                                   verbose=False)

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 12)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            print(phase, '|', end='')
            inputs, labels, patch_count, _ = data[phase]
            # Iterate over data.
            for i in range(len(patch_count)):
                #print(' ',i+1,') vs',inputs.size(1), end = '')
                input_im = inputs[i, :patch_count[i]]
                assert not torch.isnan(input_im).any()
                label_im = [labels[i].item()]
                label_im = torch.tensor(label_im)
                if torch.cuda.is_available() and with_cuda:
                    input_im = input_im.to('cuda')
                    label_im = label_im.to('cuda')
                # zero the parameter gradients
                optimizer.zero_grad()
                model._init_hidden_state(last_batch_size=1)

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    # Special case for inception because in training it has an auxiliary output. In train
                    #   mode we calculate the loss by summing the final output and the auxiliary output
                    #   but in testing we only consider the final output.
                    outputs, _, _ = model(input_im)
                    #assert not torch.isnan(outputs).any()
                    loss = criterion(outputs, label_im)
                    #loss = torch.sum(loss)
                    loss = li_regularizer(model, loss)

                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        optimizer.zero_grad()
                        loss.backward(retain_graph=True)
                        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                        #torch.nn.utils.clip_grad_norm_(model.parameters(), 0.50)
                        optimizer.step()

                # statistics
                running_loss += loss.item()
                running_corrects += torch.sum(preds == label_im.data)

            if phase == 'train':
                epoch_loss = running_loss / len(patch_count)
                epoch_acc = running_corrects.double() / len(patch_count)
            else:
                epoch_loss = running_loss / len(patch_count)
                epoch_acc = running_corrects.double() / len(patch_count)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))

            if phase == 'train':
                tr_loss = epoch_loss
                tr_acc = epoch_acc.cpu().data.numpy()
            else:
                val_loss = epoch_loss
                val_acc = epoch_acc.cpu().data.numpy()

            # deep copy the model
            if phase == 'val' and val_acc > best_val_acc:
                best_loss = epoch_loss
                best_epoch = epoch
                best_model_wts = copy.deepcopy(model.state_dict())
                best_val_acc = val_acc

            # save model
            #check_pt = "wt_ep{}.pth".format(epoch)
            #torch.save(model.state_dict(),os.path.join(hist_dir,check_pt))
        print()

        csv.write(
            str(epoch) + ',' + str(tr_loss) + ',' + str(tr_acc) + ',' +
            str(val_loss) + ',' + str(val_acc) + '\n')
        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        early_stopping(val_acc, model, hist_dir)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print("best model epoch: {}, val_loss: {:.4f}, val_acc:{:.4f}".format(
        best_epoch, best_loss, best_val_acc))
    check_pt = "wt_best_ep{}_acc_{:.3f}.pth".format(best_epoch, best_val_acc)
    # save model
    model.load_state_dict(best_model_wts)

    # Best model predictions on the validation data
    #writer.add_figure('predictions_vs._actuals', plot_classes_preds(model, inputs, labels, classes), global_step=model_num)

    torch.save(model.state_dict(), os.path.join(hist_dir, check_pt))
    return model, best_val_acc, best_loss, best_epoch
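
Examples 9 and 10 use a different EarlyStopping flavour: it takes a monitor name ('val_acc' or 'val_loss') and is called with the monitored value, the model, and an output directory. A hedged sketch of that variant is below; the checkpoint file name and the internal bookkeeping are assumptions.

import os
import torch


class EarlyStopping:
    def __init__(self, patience=7, monitor='val_acc', verbose=False):
        self.patience = patience
        self.monitor = monitor          # 'val_acc' is maximized, 'val_loss' is minimized
        self.verbose = verbose
        self.counter = 0
        self.best = None
        self.early_stop = False

    def __call__(self, value, model, out_dir):
        # Flip the sign for losses so a single comparison handles both monitors.
        score = value if self.monitor == 'val_acc' else -value
        if self.best is None or score > self.best:
            self.best = score
            torch.save(model.state_dict(), os.path.join(out_dir, 'checkpoint.pt'))
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter}/{self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
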
Example #11
def run_model_LastFM(feats_type, hidden_dim, num_heads, attn_vec_dim, rnn_type,
                     num_epochs, patience, batch_size, neighbor_samples,
                     repeat, save_postfix):
    adjlists_ua, edge_metapath_indices_list_ua, _, type_mask, train_val_test_pos_user_artist, train_val_test_neg_user_artist = load_LastFM_data(
    )
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    features_list = []
    in_dims = []
    if feats_type == 0:
        for i in range(num_ntype):
            dim = (type_mask == i).sum()
            in_dims.append(dim)
            indices = np.vstack((np.arange(dim), np.arange(dim)))
            indices = torch.LongTensor(indices)
            values = torch.FloatTensor(np.ones(dim))
            features_list.append(
                torch.sparse.FloatTensor(indices, values,
                                         torch.Size([dim, dim])).to(device))
    elif feats_type == 1:
        for i in range(num_ntype):
            dim = 10
            num_nodes = (type_mask == i).sum()
            in_dims.append(dim)
            features_list.append(torch.zeros((num_nodes, 10)).to(device))
    train_pos_user_artist = train_val_test_pos_user_artist[
        'train_pos_user_artist']
    val_pos_user_artist = train_val_test_pos_user_artist['val_pos_user_artist']
    test_pos_user_artist = train_val_test_pos_user_artist[
        'test_pos_user_artist']
    train_neg_user_artist = train_val_test_neg_user_artist[
        'train_neg_user_artist']
    val_neg_user_artist = train_val_test_neg_user_artist['val_neg_user_artist']
    test_neg_user_artist = train_val_test_neg_user_artist[
        'test_neg_user_artist']
    y_true_test = np.array([1] * len(test_pos_user_artist) +
                           [0] * len(test_neg_user_artist))

    auc_list = []
    ap_list = []
    for _ in range(repeat):
        net = MAGNN_lp([3, 3], 4, etypes_lists, in_dims, hidden_dim,
                       hidden_dim, num_heads, attn_vec_dim, rnn_type,
                       dropout_rate)
        net.to(device)
        optimizer = torch.optim.Adam(net.parameters(),
                                     lr=lr,
                                     weight_decay=weight_decay)

        # training loop
        net.train()
        early_stopping = EarlyStopping(
            patience=patience,
            verbose=True,
            save_path='checkpoint/checkpoint_{}.pt'.format(save_postfix))
        dur1 = []
        dur2 = []
        dur3 = []
        train_pos_idx_generator = index_generator(
            batch_size=batch_size, num_data=len(train_pos_user_artist))
        val_idx_generator = index_generator(batch_size=batch_size,
                                            num_data=len(val_pos_user_artist),
                                            shuffle=False)
        for epoch in range(num_epochs):
            t_start = time.time()
            # training
            net.train()
            for iteration in range(train_pos_idx_generator.num_iterations()):
                # forward
                t0 = time.time()

                train_pos_idx_batch = train_pos_idx_generator.next()
                train_pos_idx_batch.sort()
                train_pos_user_artist_batch = train_pos_user_artist[
                    train_pos_idx_batch].tolist()
                train_neg_idx_batch = np.random.choice(
                    len(train_neg_user_artist), len(train_pos_idx_batch))
                train_neg_idx_batch.sort()
                train_neg_user_artist_batch = train_neg_user_artist[
                    train_neg_idx_batch].tolist()

                train_pos_g_lists, train_pos_indices_lists, train_pos_idx_batch_mapped_lists = parse_minibatch_LastFM(
                    adjlists_ua, edge_metapath_indices_list_ua,
                    train_pos_user_artist_batch, device, neighbor_samples,
                    use_masks, num_user)
                train_neg_g_lists, train_neg_indices_lists, train_neg_idx_batch_mapped_lists = parse_minibatch_LastFM(
                    adjlists_ua, edge_metapath_indices_list_ua,
                    train_neg_user_artist_batch, device, neighbor_samples,
                    no_masks, num_user)

                t1 = time.time()
                dur1.append(t1 - t0)

                [pos_embedding_user, pos_embedding_artist], _ = net(
                    (train_pos_g_lists, features_list, type_mask,
                     train_pos_indices_lists,
                     train_pos_idx_batch_mapped_lists))
                [neg_embedding_user, neg_embedding_artist], _ = net(
                    (train_neg_g_lists, features_list, type_mask,
                     train_neg_indices_lists,
                     train_neg_idx_batch_mapped_lists))
                pos_embedding_user = pos_embedding_user.view(
                    -1, 1, pos_embedding_user.shape[1])
                pos_embedding_artist = pos_embedding_artist.view(
                    -1, pos_embedding_artist.shape[1], 1)
                neg_embedding_user = neg_embedding_user.view(
                    -1, 1, neg_embedding_user.shape[1])
                neg_embedding_artist = neg_embedding_artist.view(
                    -1, neg_embedding_artist.shape[1], 1)
                pos_out = torch.bmm(pos_embedding_user, pos_embedding_artist)
                neg_out = -torch.bmm(neg_embedding_user, neg_embedding_artist)
                train_loss = -torch.mean(
                    F.logsigmoid(pos_out) + F.logsigmoid(neg_out))

                t2 = time.time()
                dur2.append(t2 - t1)

                # autograd
                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()

                t3 = time.time()
                dur3.append(t3 - t2)

                # print training info
                if iteration % 100 == 0:
                    print(
                        'Epoch {:05d} | Iteration {:05d} | Train_Loss {:.4f} | Time1(s) {:.4f} | Time2(s) {:.4f} | Time3(s) {:.4f}'
                        .format(epoch, iteration, train_loss.item(),
                                np.mean(dur1), np.mean(dur2), np.mean(dur3)))
            # validation
            net.eval()
            val_loss = []
            with torch.no_grad():
                for iteration in range(val_idx_generator.num_iterations()):
                    # forward
                    val_idx_batch = val_idx_generator.next()
                    val_pos_user_artist_batch = val_pos_user_artist[
                        val_idx_batch].tolist()
                    val_neg_user_artist_batch = val_neg_user_artist[
                        val_idx_batch].tolist()
                    val_pos_g_lists, val_pos_indices_lists, val_pos_idx_batch_mapped_lists = parse_minibatch_LastFM(
                        adjlists_ua, edge_metapath_indices_list_ua,
                        val_pos_user_artist_batch, device, neighbor_samples,
                        no_masks, num_user)
                    val_neg_g_lists, val_neg_indices_lists, val_neg_idx_batch_mapped_lists = parse_minibatch_LastFM(
                        adjlists_ua, edge_metapath_indices_list_ua,
                        val_neg_user_artist_batch, device, neighbor_samples,
                        no_masks, num_user)

                    [pos_embedding_user, pos_embedding_artist], _ = net(
                        (val_pos_g_lists, features_list, type_mask,
                         val_pos_indices_lists,
                         val_pos_idx_batch_mapped_lists))
                    [neg_embedding_user, neg_embedding_artist], _ = net(
                        (val_neg_g_lists, features_list, type_mask,
                         val_neg_indices_lists,
                         val_neg_idx_batch_mapped_lists))
                    pos_embedding_user = pos_embedding_user.view(
                        -1, 1, pos_embedding_user.shape[1])
                    pos_embedding_artist = pos_embedding_artist.view(
                        -1, pos_embedding_artist.shape[1], 1)
                    neg_embedding_user = neg_embedding_user.view(
                        -1, 1, neg_embedding_user.shape[1])
                    neg_embedding_artist = neg_embedding_artist.view(
                        -1, neg_embedding_artist.shape[1], 1)

                    pos_out = torch.bmm(pos_embedding_user,
                                        pos_embedding_artist)
                    neg_out = -torch.bmm(neg_embedding_user,
                                         neg_embedding_artist)
                    val_loss.append(-torch.mean(
                        F.logsigmoid(pos_out) + F.logsigmoid(neg_out)))
                val_loss = torch.mean(torch.tensor(val_loss))
            t_end = time.time()
            # print validation info
            print('Epoch {:05d} | Val_Loss {:.4f} | Time(s) {:.4f}'.format(
                epoch, val_loss.item(), t_end - t_start))
            # early stopping
            early_stopping(val_loss, net)
            if early_stopping.early_stop:
                print('Early stopping!')
                break

        test_idx_generator = index_generator(
            batch_size=batch_size,
            num_data=len(test_pos_user_artist),
            shuffle=False)
        net.load_state_dict(
            torch.load('checkpoint/checkpoint_{}.pt'.format(save_postfix)))
        net.eval()
        pos_proba_list = []
        neg_proba_list = []
        with torch.no_grad():
            for iteration in range(test_idx_generator.num_iterations()):
                # forward
                test_idx_batch = test_idx_generator.next()
                test_pos_user_artist_batch = test_pos_user_artist[
                    test_idx_batch].tolist()
                test_neg_user_artist_batch = test_neg_user_artist[
                    test_idx_batch].tolist()
                test_pos_g_lists, test_pos_indices_lists, test_pos_idx_batch_mapped_lists = parse_minibatch_LastFM(
                    adjlists_ua, edge_metapath_indices_list_ua,
                    test_pos_user_artist_batch, device, neighbor_samples,
                    no_masks, num_user)
                test_neg_g_lists, test_neg_indices_lists, test_neg_idx_batch_mapped_lists = parse_minibatch_LastFM(
                    adjlists_ua, edge_metapath_indices_list_ua,
                    test_neg_user_artist_batch, device, neighbor_samples,
                    no_masks, num_user)

                [pos_embedding_user, pos_embedding_artist], _ = net(
                    (test_pos_g_lists, features_list, type_mask,
                     test_pos_indices_lists, test_pos_idx_batch_mapped_lists))
                [neg_embedding_user, neg_embedding_artist], _ = net(
                    (test_neg_g_lists, features_list, type_mask,
                     test_neg_indices_lists, test_neg_idx_batch_mapped_lists))
                pos_embedding_user = pos_embedding_user.view(
                    -1, 1, pos_embedding_user.shape[1])
                pos_embedding_artist = pos_embedding_artist.view(
                    -1, pos_embedding_artist.shape[1], 1)
                neg_embedding_user = neg_embedding_user.view(
                    -1, 1, neg_embedding_user.shape[1])
                neg_embedding_artist = neg_embedding_artist.view(
                    -1, neg_embedding_artist.shape[1], 1)

                pos_out = torch.bmm(pos_embedding_user,
                                    pos_embedding_artist).flatten()
                neg_out = torch.bmm(neg_embedding_user,
                                    neg_embedding_artist).flatten()
                pos_proba_list.append(torch.sigmoid(pos_out))
                neg_proba_list.append(torch.sigmoid(neg_out))
            y_proba_test = torch.cat(pos_proba_list + neg_proba_list)
            y_proba_test = y_proba_test.cpu().numpy()
        auc = roc_auc_score(y_true_test, y_proba_test)
        ap = average_precision_score(y_true_test, y_proba_test)
        print('Link Prediction Test')
        print('AUC = {}'.format(auc))
        print('AP = {}'.format(ap))
        auc_list.append(auc)
        ap_list.append(ap)

    print('----------------------------------------------------------------')
    print('Link Prediction Tests Summary')
    print('AUC_mean = {}, AUC_std = {}'.format(np.mean(auc_list),
                                               np.std(auc_list)))
    print('AP_mean = {}, AP_std = {}'.format(np.mean(ap_list),
                                             np.std(ap_list)))
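
The training and validation objective in this example is a negative-sampling loss: it maximizes log sigmoid(u·v) for observed user-artist pairs and log sigmoid(-u·v) for sampled negatives. The tiny self-contained snippet below reproduces the same computation on dummy embeddings, using an elementwise product and sum in place of the batched matrix multiplications above.

import torch
import torch.nn.functional as F

# Dummy user/artist embeddings for 4 positive and 4 negative pairs (dimension 8).
u_pos, a_pos = torch.randn(4, 8), torch.randn(4, 8)
u_neg, a_neg = torch.randn(4, 8), torch.randn(4, 8)

pos_out = (u_pos * a_pos).sum(dim=1)    # same score as the bmm of (1, d) x (d, 1)
neg_out = -(u_neg * a_neg).sum(dim=1)   # negated score for the negative samples
loss = -torch.mean(F.logsigmoid(pos_out) + F.logsigmoid(neg_out))
print(loss)
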
Example #12
def train(rank, world_size, args, cfg):
    dist.init_process_group(backend='nccl',
                            init_method=args.init_method,
                            world_size=world_size,
                            rank=rank)
    # dist.init_process_group(backend='nccl', rank=rank, )
    torch.cuda.set_device(args.local_rank)

    seed = int(time.time() * 256)
    torch.manual_seed(seed)

    logger = logging.getLogger(__name__)
    logging.basicConfig(level=20, format='%(asctime)s - %(message)s')

    # ================================================
    # 2) get data and load data
    # ================================================
    train_dataloader = construct_loader(cfg, 'train')
    val_dataloader = construct_loader(cfg, 'val')

    # ================================================
    # 3) init model/loss/optimizer
    # ================================================

    model = build_model(cfg)
    model.cuda()
    optimizer = optim.Adam(model.parameters(),
                           lr=cfg.SOLVER.BASE_LR,
                           weight_decay=cfg.SOLVER.WEIGHT_DECAY)

    model, optimizer = amp.initialize(model, optimizer)

    model = torch.nn.parallel.DistributedDataParallel(model)

    cudnn.benchmark = True

    loss_function = F.cross_entropy  # F.cross_entropy is a function, so it is passed directly; calling it with no arguments would raise an error

    # ================================================
    # 4) train loop
    # ================================================
    print("|------------------------|")
    print("| train on train dataset |")
    print("|------------------------|")

    early_stopping = EarlyStopping(20,
                                   verbose=True,
                                   path='checkpoints/model.pth',
                                   trace_func=logging.info)
    writer = SummaryWriter()
    start_time = time.time()
    for epoch in range(args.n_epochs):
        train_loss_lst = []
        val_loss_lst = []
        train_acc_lst = []
        val_acc_lst = []
        model.train()
        for i, train_dataset in enumerate(train_dataloader):
            train_data, train_label = train_dataset

            if cfg.NUM_GPU:
                train_data = train_data.cuda(non_blocking=True)
                train_label = train_label.cuda(non_blocking=True)
                torch.distributed.barrier()

            optimizer.zero_grad()  #

            # forward + backward + optimize
            train_outputs = model(train_data)
            train_loss = loss_function(train_outputs, train_label.long())

            adjust_lr(optimizer, epoch, cfg.SOLVER.BASE_LR)

            with amp.scale_loss(train_loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            # train_loss.backward()
            optimizer.step()

            train_acc = accuracy(train_outputs, train_label.long())

            train_acc_lst.append(train_acc)
            train_loss_lst.append(train_loss)

        train_avg_loss = sum(train_loss_lst) / len(train_loss_lst)
        train_avg_acc = sum(train_acc_lst) / len(train_acc_lst)
        # ================================================
        # 5) evaluate on validation dataset
        # ================================================

        model.eval()
        for v, val_dataset in enumerate(val_dataloader):
            val_data, val_label = val_dataset
            if cfg.NUM_GPU:
                val_data = val_data.cuda(non_blocking=True)
                val_label = val_label.cuda(non_blocking=True)

            val_outputs = model(val_data)
            val_loss = F.cross_entropy(val_outputs, val_label.long())
            val_acc = accuracy(val_outputs, val_label)

            val_acc_lst.append(val_acc)
            val_loss_lst.append(val_loss)

        val_avg_acc = sum(val_acc_lst) / len(val_acc_lst)
        val_avg_loss = sum(val_loss_lst) / len(val_loss_lst)
        logging.info(
            "Train Phase, Epoch:{}, Train_avg_loss:{}, Val_avg_loss:{},Train_avg_acc:{}, Val_avg_acc:{}"
            .format(epoch, train_avg_loss, val_avg_loss, train_avg_acc,
                    val_avg_acc))
        early_stopping(val_avg_loss, model)
        if early_stopping.early_stop:
            print('|------- Early Stop ------|')
            end_time = time.time()
            logging.info("Total spend time:{}s".format(end_time - start_time))
            break

        writer.add_scalar('Loss', train_avg_loss, epoch)
        writer.add_scalar('Accuracy', train_avg_acc, epoch)
        # note: this FileHandler is created but never attached to the logger
        logging.FileHandler('logs/{}_log.txt'.format(
            time.strftime(r"%Y-%m-%d-%H_%M_%S", time.localtime())))
Example #13
def train(model, model_type, criterion, optimizer, activate_early_stopping, scheduler,
          train_loader, val_loader,
          n_epochs=1, gpu=False, print_every=1,print_validation_every=1,
          earl_stopping_patience = 3):
    """Function to train deep learning model
    Input : model = model to train, model_type = (string) name of model type, criterion =  loss function to use for training, optimizer = optimizer to use for training, 
    activate_early_stopping = (boolean) active early stop if True, scheduler = scheduler to use for training, train_loader = (DataLoader) train set, val_loader = (DataLoader) validation set,
    n_epochs = (integer) number of epochs, gpu = (boolean) use GPU if True, print_every = (integer) periodicity for printing training loss and accuracy, 
    print_validation_every = (integer) periodicity for printing validation loss and accuracy, earl_stopping_patience = (integer) number of epochs that produced the monitored quantity 
    with no improvement after which training will be stopped """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if activate_early_stopping:
        early_stopping = EarlyStopping(patience = earl_stopping_patience, verbose=True)

    #for plotting
    hist = {'loss':[],'accuracy':[]}
    val_hist = {'loss':[],'accuracy':[]}
    for ep in range(n_epochs):
        running_loss = 0 #used by the scheduler
        running_accuracy = 0

        if model_type=='rnn':
            h = model.init_hidden(train_loader.batch_size)

        for it, data in enumerate(train_loader):
            #extract right info from data
            if model_type=='bert':
                seq,attn_masks,labels = data
            elif model_type in ['rnn','cnn']:
                seq,attn_masks,labels = data[0],torch.ones(1),data[1] #attn_mask is not important here
            else:
                raise ValueError(f'Model type "{model_type}" not supported.')

            labels = labels.type(torch.LongTensor)
            #Clear gradients
            optimizer.zero_grad()
            #Converting these to cuda tensors
            if gpu:
              seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)
            #Obtaining the logits from the model
            if model_type == 'rnn':
                h = tuple([e.data for e in h])
                output,h = model(seq,h)
            elif model_type == 'cnn':
                output = model(seq)
            elif model_type =='bert':
                output,attentions = model(seq, attn_masks)
            else:
                raise ValueError(f'Model type "{model_type}" not supported.')

            #Computing loss
            loss = criterion(output.squeeze(-1), labels)
            running_loss += loss
            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            optimizer.step()

            #accuracy update
            accuracy = torch.sum(torch.argmax(output,dim=1)==labels)/float(labels.size(0))
            running_accuracy += accuracy

            if (it + 1) % print_every == 0:
                print("Iteration {} of epoch {} complete. Loss : {}, Accuracy {} ".format(it+1, ep+1, loss.item(),accuracy))

        #scheduler step
        if not scheduler is None:
            scheduler.step(running_loss)

        #update training history
        hist['loss'].append(running_loss/(it+1)) #mean over batches
        hist['accuracy'].append(running_accuracy/(it+1)) #mean over batches

        #VALIDATION
        model.eval()
        n_batch_validation = 0
        loss_validation = 0
        accuracy_validation = 0
        #init hidden if rnn
        if model_type == 'rnn':
            val_h = model.init_hidden(val_loader.batch_size)

        for it, data in enumerate(val_loader):

            #extract right info from data
            if model_type=='bert':
                seq,attn_masks,labels = data
            elif model_type in ['rnn','cnn']:
                seq,attn_masks,labels = data[0],torch.ones(1),data[1] #attn_mask is not important here
            else:
                raise ValueError(f'Model type "{model_type}" not supported.')

            labels = labels.type(torch.LongTensor)
            if gpu:
              seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)
            #Obtaining the logits from the model
            if model_type == 'rnn':
                val_h = tuple([each.data for each in val_h])
                out, val_h = model(seq, val_h)
            elif model_type == 'cnn':
                out = model(seq)
            elif model_type=='bert':
                out, attentions_val = model(seq, attn_masks)
            else:
                raise ValueError(f'Model type "{model_type}" not supported.')

            n_batch_validation+=1
            #Computing loss
            _loss = float(criterion(out.squeeze(-1), labels))
            #computing scores
            _accu = torch.sum(torch.argmax(out,dim=1)==labels)/float(labels.size(0))
            loss_validation += _loss
            accuracy_validation += _accu
        #validation printing
        if ep % print_validation_every==0:
            print("EVALUATION Validation set : mean loss {} || mean accuracy {}".format(loss_validation/n_batch_validation, accuracy_validation/n_batch_validation))

        val_hist['loss'].append(loss_validation/n_batch_validation)
        val_hist['accuracy'].append(accuracy_validation/n_batch_validation)
        #early stopping
        if activate_early_stopping:
            early_stopping(loss_validation, model)
            if early_stopping.early_stop:
                print("Early stopping")
                break
        model.train()

    #plot history
    fig,(ax1,ax2) = plt.subplots(1,2,figsize=(14,5))
    ax1.plot(hist['loss'],label='train')
    ax1.plot(val_hist['loss'],label='validation')
    ax1.set_title('Evolution of training loss')
    ax1.legend()

    ax2.plot(hist['accuracy'],label='train')
    ax2.plot(val_hist['accuracy'],label='validation')
    ax2.set_title('Evolution of training accuracy')
    ax2.legend()

    plt.tight_layout()
    plt.show()
Example #14
def main():

    #define train set transformations
    train_transform = transforms.Compose([
        transforms.RandomRotation(5),
        transforms.RandomHorizontalFlip(0.3),
        transforms.RandomVerticalFlip(0.3),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    #define validation set transformations
    valid_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    train_dataset = ImageFolder('data/train_images/',
                                transform=train_transform)
    valid_dataset = ImageFolder('data/train_images/',
                                transform=valid_transforms)

    #split data : get indices of train and valid set
    valid_size = 0.2
    data_size = len(train_dataset)
    indices = list(range(data_size))

    #split indices
    train_indx, valid_indx, _, _ = train_test_split(indices,
                                                    indices,
                                                    test_size=valid_size,
                                                    random_state=44)

    #create samplers from indices for train and validation sets.
    train_sampler = SubsetRandomSampler(train_indx)
    valid_sampler = SubsetRandomSampler(valid_indx)

    #create dataloaders
    train_loader = DataLoader(train_dataset,
                              batch_size=TRAIN_BATCH_SIZE,
                              sampler=train_sampler)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=VALID_BATCH_SIZE,
                              sampler=valid_sampler)

    #create model
    model = MODEL_DISPATCHER[BASE_MODEL](pretrained=True)
    #model.load_state_dict(torch.load("model/checkpoints/checkpoint.pt"))
    model.to(DEVICE)

    optimizer = optim.Adam(model.parameters())
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.2,
                                                     patience=5)
    early_stopping = EarlyStopping(patience=7, verbose=True)
    criterion = nn.CrossEntropyLoss()

    for e in range(EPOCHS):
        train(train_loader, model, optimizer, criterion)
        val_score = evaluate(valid_loader, model, criterion)
        scheduler.step(val_score)
        early_stopping(val_score, model)
        if early_stopping.early_stop:
            print("Early stopping!")
            break
Example #15
def main():
    fold = str(config.fold)
    # 3.1 Create the necessary directories
    if not os.path.exists(config.submit):
        os.mkdir(config.submit)
    if not os.path.exists(config.weights):
        os.mkdir(config.weights)
    if not os.path.exists(config.best_models):
        os.mkdir(config.best_models)
    if not os.path.exists(config.logs):
        os.mkdir(config.logs)
    if not os.path.exists(config.weights + config.model_name + os.sep +
                          str(fold) + os.sep):
        os.makedirs(config.weights + config.model_name + os.sep + str(fold) +
                    os.sep)
    if not os.path.exists(config.best_models + config.model_name + os.sep +
                          str(fold) + os.sep):
        os.makedirs(config.best_models + config.model_name + os.sep +
                    str(fold) + os.sep)
    if not os.path.exists(config.submit + config.model_name + os.sep +
                          str(fold) + os.sep):
        os.makedirs(config.submit + config.model_name + os.sep + str(fold) +
                    os.sep)
    if not os.path.exists(config.logs + config.model_name + os.sep +
                          str(fold) + os.sep):
        os.makedirs(config.logs + config.model_name + os.sep + str(fold) +
                    os.sep)

    # 3.2 Get the model and optimizer, and initialize the loss function
    # model = resnet18(num_classes=len(config.class_list))
    # model = seresnet18()
    model = resnet18()
    model.cuda()
    # Initialize regularization
    # if config.weight_decay > 0:
    #     reg_loss = Regularization(model, config.weight_decay, p=1).cuda()  # L1/L2 regularization
    # else:
    #     print("no regularization")
    optimizer = optim.SGD(model.parameters(),
                          lr=config.lr,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay)
    # optimizer = optim.Adam(model.parameters(), lr=config.lr, amsgrad=True, weight_decay=config.weight_decay)

    # 3.4 Resume the training process
    criterion = nn.CrossEntropyLoss().cuda()
    start_epoch = 0
    best_precision1 = 0
    resume = False
    if resume:
        checkpoint = torch.load(config.best_models + config.model_name +
                                os.sep + str(fold) + "/model_best.pth.tar")
        start_epoch = checkpoint["epoch"]
        fold = checkpoint["fold"]
        best_precision1 = checkpoint["best_precision1"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])

    # 3.5 Get the files and split the dataset
    train_data_list, val_data_list = random_split_ratio(config.data_root,
                                                        config.class_list,
                                                        split_rate=0.2)
    # print(len(val_data_list))

    # 3.6 Load the data into DataLoaders
    train_dataloader = DataLoader(CreateImgDataset(train_data_list),
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn,
                                  pin_memory=True,
                                  num_workers=4)
    val_dataloader = DataLoader(CreateImgDataset(val_data_list, train=False),
                                batch_size=1,
                                shuffle=True,
                                collate_fn=collate_fn,
                                pin_memory=False,
                                num_workers=4)

    # 4.1 Initialize the learning-rate scheduler
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=config.step_size,
                                          gamma=config.gamma)

    # 4.2 Define metrics
    train_losses = AverageMeter()
    train_top1 = AverageMeter()
    valid_loss = [np.inf, 0, 0]
    model.train()

    # 5. Training loop
    start = timer()
    train_list = []
    valid_list = []
    label_list = []
    y_pred = []
    target_list = []
    # 5.1 Initialize early stopping
    early_stopping = EarlyStopping(patience=config.patience, verbose=True)
    for epoch in range(start_epoch, config.epochs):
        # 5.2 Adjust the learning rate
        if get_learning_rate(optimizer) > 1e-8:
            scheduler.step(epoch)
        # 5.3 Iterate over the training set
        train_progressor = ProgressBar(mode="Train",
                                       epoch=epoch,
                                       total_epoch=config.epochs,
                                       model_name=config.model_name,
                                       path=config.logs + config.model_name +
                                       os.sep + str(fold) + os.sep,
                                       total=len(train_dataloader))
        for batch, (input, target) in enumerate(train_dataloader):
            train_progressor.current = batch
            # 5.4 Feed the data into the network for training
            model.train()
            input = Variable(input).cuda()
            target = Variable(torch.from_numpy(np.array(target)).long()).cuda()
            output = model(input)
            # 5.5 Compute the training loss
            loss = criterion(output, target)
            # if config.weight_decay > 0:
            #     loss = loss + reg_loss(model)
            # 5.6 Compute the accuracy
            precision1_train, precision2_train = accuracy(output,
                                                          target,
                                                          topk=(1, 2))
            train_losses.update(loss.item(), input.size(0))
            train_top1.update(precision1_train[0], input.size(0))
            train_progressor.current_loss = train_losses.avg
            train_progressor.current_top1 = train_top1.avg
            # 5.7 Backpropagate and update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_progressor()

        train_progressor.done()
        train_list.append([train_losses.avg, train_top1.avg.cpu().data.item()])

        # 6 Evaluate after each epoch
        valid, target_list_t, label_list_t, y_pred_t = evaluate(
            val_dataloader, model, criterion, fold, epoch)
        valid_list.append(valid)
        # 6.1 Save the best model
        is_best = valid[1] > best_precision1
        best_precision1 = max(valid[1], best_precision1)
        if is_best:
            target_list = target_list_t
            label_list = label_list_t
            y_pred = y_pred_t
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "model_name": config.model_name,
                "state_dict": model.state_dict(),
                "best_precision1": best_precision1,
                "optimizer": optimizer.state_dict(),
                "fold": fold,
                "valid_loss": valid[0],
            }, is_best, fold)

        # 6.2 Check early stopping on the validation metric (negated accuracy, so lower means better)
        early_stopping(-valid[1], model)
        # if the early stopping condition is met
        if early_stopping.early_stop:
            print("Early stopping")
            # stop training
            break
    print("训练用时:" + time_to_str((timer() - start), 'min'))
    # 6.3 保存最优模型评估结果到excel
    Y_pred = np.asarray(y_pred)
    f = xlwt.Workbook()
    train_sheet = f.add_sheet(u'train', cell_overwrite_ok=True)  # create the sheets
    val_sheet = f.add_sheet(u'verify', cell_overwrite_ok=True)
    result_sheet = f.add_sheet(u'result', cell_overwrite_ok=True)
    for i, t in enumerate(train_list):
        train_sheet.write(i, 0, t[0])
        train_sheet.write(i, 1, t[1])
    for i, v in enumerate(valid_list):
        val_sheet.write(i, 0, v[0])
        val_sheet.write(i, 1, v[1])
    for i, r in enumerate(target_list):
        result_sheet.write(i, 0, r)
        result_sheet.write(i, 1, int(label_list[i]))
        for c in range(0, len(config.class_list)):
            result_sheet.write(i, 2 + c, Y_pred[i, c])
    # xlwt writes the legacy .xls format, so save with a matching extension
    f.save(config.logs + config.model_name + os.sep + str(fold) + os.sep +
           config.model_name + '.xls')
    # 6.4 Plot the training curves and the result figures
    plot_training(train_list, valid_list, fold, config.dpi)
    plot_result(config.class_list, fold, best_precision1, target_list,
                label_list, Y_pred, config.dpi)
def train_attention():
    param_class = get_param_class(args.data)
    run_id = args.data + '_' + str(uuid.uuid1())

    dataset_train = LeafDataset(
        data_path=args.dataset_path,
        genotype=param_class.genotype,
        inoculated=param_class.inoculated,
        dai=param_class.dai,
        test_size=param_class.test_size,
        signature_pre_clip=param_class.signature_pre_clip,
        signature_post_clip=param_class.signature_post_clip,
        max_num_balanced_inoculated=param_class.max_num_balanced_inoculated,
        num_samples_file=param_class.num_samples_file,
        split=args.split,
        mode='train',
        superpixel=True,
        bags=True,
        validation=True)  # 50000
    dataset_test = LeafDataset(
        data_path=args.dataset_path,
        genotype=param_class.genotype,
        inoculated=param_class.inoculated,
        dai=param_class.dai,
        test_size=param_class.test_size,
        signature_pre_clip=param_class.signature_pre_clip,
        signature_post_clip=param_class.signature_post_clip,
        max_num_balanced_inoculated=param_class.
        max_num_balanced_inoculated,  # 50000
        num_samples_file=param_class.num_samples_file,
        split=args.split,
        mode="test",
        superpixel=True,
        bags=True,
        validation=True)
    dataset_val = LeafDataset(
        data_path=args.dataset_path,
        genotype=param_class.genotype,
        inoculated=param_class.inoculated,
        dai=param_class.dai,
        test_size=param_class.test_size,
        signature_pre_clip=param_class.signature_pre_clip,
        signature_post_clip=param_class.signature_post_clip,
        max_num_balanced_inoculated=param_class.
        max_num_balanced_inoculated,  # 50000
        num_samples_file=param_class.num_samples_file,
        split=args.split,
        mode="validation",
        superpixel=True,
        bags=True,
        validation=True)

    print("Number of samples train", len(dataset_train))
    print("Number of samples test", len(dataset_test))
    print("Number of samples val", len(dataset_val))
    dataloader = DataLoader(dataset_train,
                            batch_size=1,
                            shuffle=True,
                            num_workers=0)
    dataloader_test = DataLoader(dataset_test,
                                 batch_size=1,
                                 shuffle=False,
                                 num_workers=0,
                                 drop_last=False)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=1,
                                shuffle=False,
                                num_workers=0,
                                drop_last=False)

    hyperparams = dataset_train.hyperparams
    print("Number of batches train", len(dataloader))
    print("Number of batches test", len(dataloader_test))
    print("Number of batches val", len(dataloader_val))

    # Original class counts train: 67578 264112
    # Original class counts test: 68093 263597
    hyperparams['num_classes'] = param_class.num_classes
    hyperparams['hidden_layer_size'] = param_class.hidden_layer_size
    hyperparams['num_heads'] = param_class.num_heads
    hyperparams['lr'] = args.lr
    hyperparams['num_epochs'] = args.num_epochs
    hyperparams['lr_scheduler_steps'] = args.lr_scheduler_steps

    model = SANNetwork(input_size=dataset_train.input_size,
                       num_classes=hyperparams['num_classes'],
                       hidden_layer_size=hyperparams['hidden_layer_size'],
                       dropout=0.9,
                       num_heads=hyperparams['num_heads'],
                       device="cuda")

    #model = ConvNetBarley(elu=False, avgpool=False, nll=False, num_classes=param_class.num_classes)
    #model = CNNModel(num_classes=param_class.num_classes)

    num_epochs = hyperparams['num_epochs']
    optimizer = torch.optim.Adam(model.parameters(), lr=hyperparams['lr'])

    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, hyperparams['lr_scheduler_steps'], gamma=0.5, last_epoch=-1)
    num_params = sum(p.numel() for p in model.parameters())
    print("Number of parameters {}".format(num_params))
    print("Starting training for {} epochs".format(num_epochs))
    save_dir = "./uv_dataset/results_cv/"
    writer = SummaryWriter(log_dir=save_dir + run_id,
                           comment="_" + "_id_{}".format(run_id))

    #device = "cuda"
    #model.to(device)

    #balanced_loss_weight = torch.tensor([0.75, 0.25], device=device)  # torch.tensor([0.75, 0.25], device=device)
    balanced_loss_weight = torch.tensor([0.75, 0.25])
    crit = torch.nn.CrossEntropyLoss(weight=balanced_loss_weight)
    best_acc = 0
    early_stopping = EarlyStopping(patience=60, verbose=True)
    for epoch in tqdm(range(num_epochs)):
        setproctitle("Gerste_MIL" + args.mode +
                     " | epoch {} of {}".format(epoch + 1, num_epochs))
        losses_per_batch = []
        correct = 0
        target, pred = [], []
        total = 0
        for i, (features, labels) in enumerate(dataloader):
            #labels = labels[2]
            features = features.float()  #.to(device)
            #features = features.permute((1, 0, 2, 3, 4))
            labels = labels.long()  #.to(device)
            model.train()
            outputs, _ = model.forward(features)
            outputs = outputs.view(labels.shape[0], -1)
            labels = labels.view(-1)
            loss = crit(outputs, labels)
            optimizer.zero_grad()
            _, predicted = torch.max(outputs.data, 1)
            batch_pred, batch_target = getPredAndTarget(outputs, labels)
            target.append(batch_target)
            pred.append(batch_pred)
            # correct += balanced_accuracy(batch_target, batch_pred) * labels.size(0)  # mean
            # correct += (predicted == labels).sum().item()
            total += labels.size(0)
            loss.backward()
            optimizer.step()
            losses_per_batch.append(float(loss))
        mean_loss = np.mean(losses_per_batch)
        correct = balanced_accuracy(target, pred)
        writer.add_scalar('Loss/train', mean_loss, epoch)
        writer.add_scalar('Accuracy/train', 100 * correct, epoch)
        print("Epoch {}, mean loss per batch {}, train acc {}".format(
            epoch, mean_loss, 100 * correct))
        if (epoch + 1) % args.test_epoch == 0 or epoch + 1 == num_epochs:
            # Testing

            correct_test = 0
            target, pred = [], []
            total = 0
            model.eval()
            losses_per_batch = []
            attention_weights = []
            with torch.no_grad():
                for i, (features, labels) in enumerate(dataloader_test):
                    #labels = labels[2]
                    features = features.float()  #.to(device)
                    #features = features.permute((1, 0, 2, 3, 4))
                    labels = labels.long()  #.to(device)
                    outputs, att = model.forward(features)
                    attention_weights.append(att.squeeze(0).numpy())
                    outputs = outputs.view(labels.shape[0], -1)
                    labels = labels.view(-1)
                    loss = crit(outputs, labels)
                    losses_per_batch.append(float(loss))
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    batch_pred, batch_target = getPredAndTarget(
                        outputs, labels)
                    target.append(batch_target)
                    pred.append(batch_pred)
                    # correct_test += balanced_accuracy(batch_target, batch_pred) * labels.size(0)
                    # correct += (predicted == labels).sum().item()
                mean_loss = np.mean(losses_per_batch)
                print(target, pred)
                correct_test = balanced_accuracy(target, pred)
                writer.add_scalar('Loss/test', mean_loss, epoch)
                np.save('attention_weights.npy', attention_weights)
            print(
                'Accuracy, mean loss per batch of the network on the test samples: {} %, {}'
                .format(100 * correct_test, mean_loss))
            writer.add_scalar('Accuracy/test', 100 * correct_test, epoch)

            # Validation
            correct_val = 0
            target, pred = [], []
            total = 0
            losses_per_batch = []
            attention_weights_val = []
            with torch.no_grad():
                for i, (features, labels) in enumerate(dataloader_val):
                    #labels = labels[2]
                    features = features.float()  #.to(device)
                    #features = features.permute((1, 0, 2, 3, 4))
                    labels = labels.long()  #.to(device)
                    outputs, att = model.forward(features)
                    attention_weights_val.append(att.squeeze(0).numpy())
                    outputs = outputs.view(labels.shape[0], -1)
                    labels = labels.view(-1)
                    loss = crit(outputs, labels)
                    losses_per_batch.append(float(loss))
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    batch_pred, batch_target = getPredAndTarget(
                        outputs, labels)
                    target.append(batch_target)
                    pred.append(batch_pred)
                    # correct_val += balanced_accuracy(batch_target, batch_pred) * labels.size(0)
                    # correct += (predicted == labels).sum().item()
                mean_loss = np.mean(losses_per_batch)
                print(target, pred)
                correct_val = balanced_accuracy(target, pred)
                writer.add_scalar('Loss/val', mean_loss, epoch)
                #np.save('attention_weights_val.npy', attention_weights_val)
            print(
                'Accuracy, mean loss per batch of the network on the validation samples: {} %, {}'
                .format(100 * correct_val, mean_loss))
            writer.add_scalar('Accuracy/val', 100 * correct_val, epoch)
            early_stopping(mean_loss, model)

            if early_stopping.early_stop:
                print("Early stopping")
                break

            if (correct_test) >= best_acc:
                best_acc = (correct_test)
            model.train()

        scheduler.step()
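
`train_attention` above accumulates per-batch predictions with `getPredAndTarget` and scores them with `balanced_accuracy`, neither of which is shown. The following is a plausible sketch, assuming `getPredAndTarget` returns plain Python lists (so they can be appended batch by batch) and `balanced_accuracy` flattens the nested per-batch lists before delegating to scikit-learn; the exact behaviour of both helpers is an assumption for illustration.

from itertools import chain

import torch
from sklearn.metrics import balanced_accuracy_score


def getPredAndTarget(outputs, labels):
    # Argmax over the class dimension; return plain lists so callers can append per batch.
    preds = torch.argmax(outputs, dim=1)
    return preds.cpu().tolist(), labels.cpu().tolist()


def balanced_accuracy(target, pred):
    # `target` and `pred` are lists of per-batch lists; flatten them before scoring.
    y_true = list(chain.from_iterable(target))
    y_pred = list(chain.from_iterable(pred))
    return balanced_accuracy_score(y_true, y_pred)
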
Ejemplo n.º 17
0
    torch.manual_seed(0)
    np.random.seed(0)
    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(0)

    model = FECNet(pretrained=args.pretrained)
    Num_Param = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Number of Trainable Parameters= %d" % (Num_Param))

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.9)

    early_stopping = EarlyStopping(patience=50, verbose=True)

    running_loss = 0
    print_per_epoch = 1
    correct = 0
    Len = 0

    tr_dataloader, val_dataloader = DATALoader(csv_file='data/labels.csv',
                                               args=args)

    for epoch in range(args.epochs):
        # scheduler.step()

        # Training
        for i_batch, sample_batched in enumerate(tr_dataloader):
            model.zero_grad()
Ejemplo n.º 18
0
def main():
    cmd_ls = sys.argv[1:]
    cmd = generate_cmd(cmd_ls)
    if "--freeze_bn False" in cmd:
        opt.freeze_bn = False
    if "--addDPG False" in cmd:
        opt.addDPG = False

    print(
        "----------------------------------------------------------------------------------------------------"
    )
    print("This is the model with id {}".format(save_ID))
    print(opt)
    print("Training backbone is: {}".format(opt.backbone))
    dataset_str = ""
    for k, v in config.train_info.items():
        dataset_str += k
        dataset_str += ","
    print("Training data is: {}".format(dataset_str[:-1]))
    print("Warm up end at {}".format(warm_up_epoch))
    for k, v in config.bad_epochs.items():
        if v > 1:
            raise ValueError("Wrong stopping accuracy!")
    print(
        "----------------------------------------------------------------------------------------------------"
    )

    exp_dir = os.path.join("exp/{}/{}".format(folder, save_ID))
    log_dir = os.path.join(exp_dir, "{}".format(save_ID))
    os.makedirs(log_dir, exist_ok=True)
    log_name = os.path.join(log_dir, "{}.txt".format(save_ID))
    train_log_name = os.path.join(log_dir, "{}_train.xlsx".format(save_ID))
    bn_file = os.path.join(log_dir, "{}_bn.txt".format(save_ID))
    # Prepare Dataset

    # Model Initialize
    if device != "cpu":
        m = createModel(cfg=model_cfg).cuda()
    else:
        m = createModel(cfg=model_cfg).cpu()
    print(m, file=open("model.txt", "w"))

    begin_epoch = 0
    pre_train_model = opt.loadModel
    flops = print_model_param_flops(m)
    print("FLOPs of current model is {}".format(flops))
    params = print_model_param_nums(m)
    print("Parameters of current model is {}".format(params))
    inf_time = get_inference_time(m,
                                  height=opt.outputResH,
                                  width=opt.outputResW)
    print("Inference time is {}".format(inf_time))
    print(
        "----------------------------------------------------------------------------------------------------"
    )

    if opt.freeze > 0 or opt.freeze_bn:
        if opt.backbone == "mobilenet":
            feature_layer_num = 155
            feature_layer_name = "features"
        elif opt.backbone == "seresnet101":
            feature_layer_num = 327
            feature_layer_name = "preact"
        elif opt.backbone == "seresnet18":
            feature_layer_num = 75
            feature_layer_name = "seresnet18"
        elif opt.backbone == "shufflenet":
            feature_layer_num = 167
            feature_layer_name = "shuffle"
        else:
            raise ValueError("Not a correct name")

        feature_num = int(opt.freeze * feature_layer_num)

        for idx, (n, p) in enumerate(m.named_parameters()):
            if len(p.shape) == 1 and opt.freeze_bn:
                p.requires_grad = False
            elif feature_layer_name in n and idx < feature_num:
                p.requires_grad = False
            else:
                p.requires_grad = True

    writer = SummaryWriter('exp/{}/{}'.format(folder, save_ID), comment=cmd)

    if device != "cpu":
        # rnd_inps = Variable(torch.rand(3, 3, 224, 224), requires_grad=True).cuda()
        rnd_inps = torch.rand(3, 3, 224, 224).cuda()
    else:
        rnd_inps = torch.rand(3, 3, 224, 224)
        # rnd_inps = Variable(torch.rand(3, 3, 224, 224), requires_grad=True)
    try:
        writer.add_graph(m, (rnd_inps, ))
    except:
        pass

    shuffle_dataset = False
    for k, v in config.train_info.items():
        if k not in open_source_dataset:
            shuffle_dataset = True

    train_dataset = MyDataset(config.train_info, train=True)
    val_dataset = MyDataset(config.train_info, train=False)
    if shuffle_dataset:
        val_dataset.img_val, val_dataset.bbox_val, val_dataset.part_val = \
            train_dataset.img_val, train_dataset.bbox_val, train_dataset.part_val

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opt.trainBatch,
                                               shuffle=True,
                                               num_workers=opt.trainNW,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opt.validBatch,
                                             shuffle=True,
                                             num_workers=opt.valNW,
                                             pin_memory=True)

    # for k, v in config.train_info.items():
    #     train_dataset = Mscoco([v[0], v[1]], train=True, val_img_num=v[2])
    #     val_dataset = Mscoco([v[0], v[1]], train=False, val_img_num=v[2])
    #
    # train_loaders[k] = torch.utils.data.DataLoader(
    #     train_dataset, batch_size=config.train_batch, shuffle=True, num_workers=config.train_mum_worker,
    #     pin_memory=True)
    #
    # val_loaders[k] = torch.utils.data.DataLoader(
    #     val_dataset, batch_size=config.val_batch, shuffle=False, num_workers=config.val_num_worker, pin_memory=True)
    #
    # train_loader = torch.utils.data.DataLoader(
    #         train_dataset, batch_size=config.train_batch, shuffle=True, num_workers=config.train_mum_worker,
    #         pin_memory=True)
    # val_loader = torch.utils.data.DataLoader(
    #         val_dataset, batch_size=config.val_batch, shuffle=False, num_workers=config.val_num_worker, pin_memory=True)

    # assert train_loaders != {}, "Your training data has not been specific! "

    os.makedirs("exp/{}/{}".format(folder, save_ID), exist_ok=True)
    if pre_train_model:
        if "duc_se.pth" not in pre_train_model:
            if "pretrain" not in pre_train_model:
                try:
                    info_path = os.path.join("exp", folder, save_ID,
                                             "option.pkl")
                    info = torch.load(info_path)
                    opt.trainIters = info.trainIters
                    opt.valIters = info.valIters
                    begin_epoch = int(pre_train_model.split("_")[-1][:-4]) + 1
                except:
                    # begin_epoch = int(pre_train_model.split("_")[-1][:-4]) + 1
                    with open(log_name, "a+") as f:
                        f.write(cmd)

            print('Loading Model from {}'.format(pre_train_model))
            m.load_state_dict(torch.load(pre_train_model))
        else:
            with open(log_name, "a+") as f:
                f.write(cmd)
            print('Loading Model from {}'.format(pre_train_model))
            m.load_state_dict(torch.load(pre_train_model))
            m.conv_out = nn.Conv2d(m.DIM,
                                   opt.kps,
                                   kernel_size=3,
                                   stride=1,
                                   padding=1)
            if device != "cpu":
                m.conv_out.cuda()
            os.makedirs("exp/{}/{}".format(folder, save_ID), exist_ok=True)
    else:
        print('Create new model')
        with open(log_name, "a+") as f:
            f.write(cmd)
            print(opt, file=f)
            f.write("FLOPs of current model is {}\n".format(flops))
            f.write("Parameters of current model is {}\n".format(params))

    with open(os.path.join(log_dir, "tb.py"), "w") as pyfile:
        pyfile.write("import os\n")
        pyfile.write("os.system('conda init bash')\n")
        pyfile.write("os.system('conda activate py36')\n")
        pyfile.write(
            "os.system('tensorboard --logdir=../../../../exp/{}/{}')".format(
                folder, save_ID))

    params_to_update, layers = [], 0
    for name, param in m.named_parameters():
        layers += 1
        if param.requires_grad:
            params_to_update.append(param)
    print("Training {} layers out of {}".format(len(params_to_update), layers))

    if optimize == 'rmsprop':
        optimizer = torch.optim.RMSprop(params_to_update,
                                        lr=opt.LR,
                                        momentum=opt.momentum,
                                        weight_decay=opt.weightDecay)
    elif optimize == 'adam':
        optimizer = torch.optim.Adam(params_to_update,
                                     lr=opt.LR,
                                     weight_decay=opt.weightDecay)
    elif optimize == 'sgd':
        optimizer = torch.optim.SGD(params_to_update,
                                    lr=opt.LR,
                                    momentum=opt.momentum,
                                    weight_decay=opt.weightDecay)
    else:
        raise Exception

    if mix_precision:
        m, optimizer = amp.initialize(m, optimizer, opt_level="O1")

    # Model Transfer
    if device != "cpu":
        m = torch.nn.DataParallel(m).cuda()
        criterion = torch.nn.MSELoss().cuda()
    else:
        m = torch.nn.DataParallel(m)
        criterion = torch.nn.MSELoss()

    # loss, acc = valid(val_loader, m, criterion, optimizer, writer)
    # print('Valid:-{idx:d} epoch | loss:{loss:.8f} | acc:{acc:.4f}'.format(
    #     idx=-1,
    #     loss=loss,
    #     acc=acc
    # ))

    early_stopping = EarlyStopping(patience=opt.patience, verbose=True)
    train_acc, val_acc, train_loss, val_loss, best_epoch, train_dist, val_dist, train_auc, val_auc, train_PR, val_PR = \
        0, 0, float("inf"), float("inf"), 0, float("inf"), float("inf"), 0, 0, 0, 0
    train_acc_ls, val_acc_ls, train_loss_ls, val_loss_ls, train_dist_ls, val_dist_ls, train_auc_ls, val_auc_ls, \
        train_pr_ls, val_pr_ls, epoch_ls, lr_ls = [], [], [], [], [], [], [], [], [], [], [], []
    decay, decay_epoch, lr, i = 0, [], opt.LR, begin_epoch
    stop = False
    m_best = m

    train_log = open(train_log_name, "w", newline="")
    bn_log = open(bn_file, "w")
    csv_writer = csv.writer(train_log)
    csv_writer.writerow(write_csv_title())
    begin_time = time.time()

    os.makedirs("result", exist_ok=True)
    result = os.path.join(
        "result", "{}_result_{}.csv".format(opt.expFolder, config.computer))
    exist = os.path.exists(result)

    # Start Training
    try:
        for i in range(opt.nEpochs)[begin_epoch:]:

            opt.epoch = i
            epoch_ls.append(i)
            train_log_tmp = [save_ID, i, lr]

            log = open(log_name, "a+")
            print('############# Starting Epoch {} #############'.format(i))
            log.write(
                '############# Starting Epoch {} #############\n'.format(i))

            # optimizer, lr = adjust_lr(optimizer, i, config.lr_decay, opt.nEpochs)
            # writer.add_scalar("lr", lr, i)
            # print("epoch {}: lr {}".format(i, lr))

            loss, acc, dist, auc, pr, pt_acc, pt_dist, pt_auc, pt_pr = \
                train(train_loader, m, criterion, optimizer, writer)
            train_log_tmp.append(" ")
            train_log_tmp.append(loss)
            train_log_tmp.append(acc.tolist())
            train_log_tmp.append(dist.tolist())
            train_log_tmp.append(auc)
            train_log_tmp.append(pr)
            for a in pt_acc:
                train_log_tmp.append(a.tolist())
            train_log_tmp.append(" ")
            for d in pt_dist:
                train_log_tmp.append(d.tolist())
            train_log_tmp.append(" ")
            for ac in pt_auc:
                train_log_tmp.append(ac)
            train_log_tmp.append(" ")
            for p in pt_pr:
                train_log_tmp.append(p)
            train_log_tmp.append(" ")

            train_acc_ls.append(acc)
            train_loss_ls.append(loss)
            train_dist_ls.append(dist)
            train_auc_ls.append(auc)
            train_pr_ls.append(pr)
            train_acc = acc if acc > train_acc else train_acc
            train_loss = loss if loss < train_loss else train_loss
            train_dist = dist if dist < train_dist else train_dist
            train_auc = auc if auc > train_auc else train_auc
            train_PR = pr if pr > train_PR else train_PR

            log.write(
                'Train:{idx:d} epoch | loss:{loss:.8f} | acc:{acc:.4f} | dist:{dist:.4f} | AUC: {AUC:.4f} | PR: {PR:.4f}\n'
                .format(
                    idx=i,
                    loss=loss,
                    acc=acc,
                    dist=dist,
                    AUC=auc,
                    PR=pr,
                ))

            opt.acc = acc
            opt.loss = loss
            m_dev = m.module

            loss, acc, dist, auc, pr, pt_acc, pt_dist, pt_auc, pt_pr = valid(
                val_loader, m, criterion, writer)
            train_log_tmp.insert(9, loss)
            train_log_tmp.insert(10, acc.tolist())
            train_log_tmp.insert(11, dist.tolist())
            train_log_tmp.insert(12, auc)
            train_log_tmp.insert(13, pr)
            train_log_tmp.insert(14, " ")
            for a in pt_acc:
                train_log_tmp.append(a.tolist())
            train_log_tmp.append(" ")
            for d in pt_dist:
                train_log_tmp.append(d.tolist())
            train_log_tmp.append(" ")
            for ac in pt_auc:
                train_log_tmp.append(ac)
            train_log_tmp.append(" ")
            for p in pt_pr:
                train_log_tmp.append(p)
            train_log_tmp.append(" ")

            val_acc_ls.append(acc)
            val_loss_ls.append(loss)
            val_dist_ls.append(dist)
            val_auc_ls.append(auc)
            val_pr_ls.append(pr)
            if acc > val_acc:
                best_epoch = i
                val_acc = acc
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_best_acc.pkl'.format(folder, save_ID))
                m_best = copy.deepcopy(m)
            val_loss = loss if loss < val_loss else val_loss
            if dist < val_dist:
                val_dist = dist
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_best_dist.pkl'.format(folder, save_ID))
            if auc > val_auc:
                val_auc = auc
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_best_auc.pkl'.format(folder, save_ID))
            if pr > val_PR:
                val_PR = pr
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_best_pr.pkl'.format(folder, save_ID))

            log.write(
                'Valid:{idx:d} epoch | loss:{loss:.8f} | acc:{acc:.4f} | dist:{dist:.4f} | AUC: {AUC:.4f} | PR: {PR:.4f}\n'
                .format(
                    idx=i,
                    loss=loss,
                    acc=acc,
                    dist=dist,
                    AUC=auc,
                    PR=pr,
                ))

            bn_sum, bn_num = 0, 0
            for mod in m.modules():
                if isinstance(mod, nn.BatchNorm2d):
                    bn_num += mod.num_features
                    bn_sum += torch.sum(abs(mod.weight))
                    writer.add_histogram("bn_weight",
                                         mod.weight.data.cpu().numpy(), i)

            bn_ave = bn_sum / bn_num
            bn_log.write("{} --> {}".format(i, bn_ave))
            print("Current bn : {} --> {}".format(i, bn_ave))
            bn_log.write("\n")
            log.close()
            csv_writer.writerow(train_log_tmp)

            writer.add_scalar("lr", lr, i)
            print("epoch {}: lr {}".format(i, lr))
            lr_ls.append(lr)

            torch.save(opt, 'exp/{}/{}/option.pkl'.format(folder, save_ID))
            if i % opt.save_interval == 0 and i != 0:
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_{2}.pkl'.format(folder, save_ID, i))
                # torch.save(
                #     optimizer, 'exp/{}/{}/optimizer.pkl'.format(dataset, save_folder))

            if i < warm_up_epoch:
                optimizer, lr = warm_up_lr(optimizer, i)
            elif i == warm_up_epoch:
                lr = opt.LR
                early_stopping(acc)
            else:
                early_stopping(acc)
                if early_stopping.early_stop:
                    optimizer, lr = lr_decay(optimizer, lr)
                    decay += 1
                    # if decay == 2:
                    #     draw_pred_img = False
                    if decay > opt.lr_decay_time:
                        stop = True
                    else:
                        decay_epoch.append(i)
                        early_stopping.reset(
                            int(opt.patience * patience_decay[decay]))
                        # torch.save(m_dev.state_dict(), 'exp/{0}/{1}/{1}_decay{2}.pkl'.format(folder, save_ID, decay))
                        m = m_best

            for epo, ac in config.bad_epochs.items():
                if i == epo and val_acc < ac:
                    stop = True
            if stop:
                print("Training finished at epoch {}".format(i))
                break

        training_time = time.time() - begin_time
        writer.close()
        train_log.close()

        # draw_graph(epoch_ls, train_loss_ls, val_loss_ls, train_acc_ls, val_acc_ls, train_dist_ls, val_dist_ls, log_dir)
        draw_graph(epoch_ls, train_loss_ls, val_loss_ls, "loss", log_dir)
        draw_graph(epoch_ls, train_acc_ls, val_acc_ls, "acc", log_dir)
        draw_graph(epoch_ls, train_auc_ls, val_auc_ls, "AUC", log_dir)
        draw_graph(epoch_ls, train_dist_ls, val_dist_ls, "dist", log_dir)
        draw_graph(epoch_ls, train_pr_ls, val_pr_ls, "PR", log_dir)

        with open(result, "a+") as f:
            if not exist:
                title_str = "id,backbone,structure,DUC,params,flops,time,loss_param,addDPG,kps,batch_size,optimizer," \
                            "freeze_bn,freeze,sparse,sparse_decay,epoch_num,LR,Gaussian,thresh,weightDecay,loadModel," \
                            "model_location, ,folder_name,training_time,train_acc,train_loss,train_dist,train_AUC," \
                            "train_PR,val_acc,val_loss,val_dist,val_AUC,val_PR,best_epoch,final_epoch"
                title_str = write_decay_title(len(decay_epoch), title_str)
                f.write(title_str)
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n".\
                format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
                       opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
                       opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
                       os.path.join(folder, save_ID), training_time, train_acc, train_loss, train_dist, train_auc,
                       train_PR, val_acc, val_loss, val_dist, val_auc, val_PR, best_epoch, i)
            info_str = write_decay_info(decay_epoch, info_str)
            f.write(info_str)
    # except IOError:
    #     with open(result, "a+") as f:
    #         training_time = time.time() - begin_time
    #         writer.close()
    #         train_log.close()
    #         info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{}\n". \
    #             format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
    #                    opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
    #                    opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
    #                    os.path.join(folder, save_ID), training_time, "Some file is closed")
    #         f.write(info_str)
    except ZeroDivisionError:
        with open(result, "a+") as f:
            training_time = time.time() - begin_time
            writer.close()
            train_log.close()
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{}\n". \
                format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
                       opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
                       opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
                       os.path.join(folder, save_ID), training_time, "Gradient flow")
            f.write(info_str)
    except KeyboardInterrupt:
        with open(result, "a+") as f:
            training_time = time.time() - begin_time
            writer.close()
            train_log.close()
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{}\n". \
                format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
                       opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
                       opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
                       os.path.join(folder, save_ID), training_time, "Be killed by someone")
            f.write(info_str)

    print("Model {} training finished".format(save_ID))
    print(
        "----------------------------------------------------------------------------------------------------"
    )
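
The loop above couples early stopping with learning-rate warm-up and decay through `warm_up_lr` and `lr_decay`, which are not shown. A minimal sketch of what such helpers might look like follows; the base learning rate, warm-up length and decay factor are assumptions for illustration (in the snippet they would come from `opt.LR`, `warm_up_epoch` and the config).

def warm_up_lr(optimizer, epoch, base_lr=0.001, warm_up_epoch=5):
    # Linearly ramp the learning rate towards base_lr during the warm-up phase.
    lr = base_lr * (epoch + 1) / warm_up_epoch
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer, lr


def lr_decay(optimizer, lr, factor=0.1):
    # Shrink the learning rate once the early-stopping patience has been exhausted.
    lr = lr * factor
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer, lr
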
Ejemplo n.º 19
0
def run_model_IMDB(feats_type, num_layers, hidden_dim, num_heads, attn_vec_dim,
                   rnn_type, num_epochs, patience, repeat, save_postfix):
    nx_G_lists, edge_metapath_indices_lists, features_list, adjM, type_mask, labels, train_val_test_idx = load_IMDB_data(
    )
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    features_list = [
        torch.FloatTensor(features.todense()).to(device)
        for features in features_list
    ]
    if feats_type == 0:
        in_dims = [features.shape[1] for features in features_list]
    elif feats_type == 1:
        in_dims = [features_list[0].shape[1]] + [10] * (len(features_list) - 1)
        for i in range(1, len(features_list)):
            features_list[i] = torch.zeros(
                (features_list[i].shape[0], 10)).to(device)
    elif feats_type == 2:
        in_dims = [features.shape[0] for features in features_list]
        in_dims[0] = features_list[0].shape[1]
        for i in range(1, len(features_list)):
            dim = features_list[i].shape[0]
            indices = np.vstack((np.arange(dim), np.arange(dim)))
            indices = torch.LongTensor(indices)
            values = torch.FloatTensor(np.ones(dim))
            features_list[i] = torch.sparse.FloatTensor(
                indices, values, torch.Size([dim, dim])).to(device)
    elif feats_type == 3:
        in_dims = [features.shape[0] for features in features_list]
        for i in range(len(features_list)):
            dim = features_list[i].shape[0]
            indices = np.vstack((np.arange(dim), np.arange(dim)))
            indices = torch.LongTensor(indices)
            values = torch.FloatTensor(np.ones(dim))
            features_list[i] = torch.sparse.FloatTensor(
                indices, values, torch.Size([dim, dim])).to(device)
    edge_metapath_indices_lists = [[
        torch.LongTensor(indices).to(device) for indices in indices_list
    ] for indices_list in edge_metapath_indices_lists]
    labels = torch.LongTensor(labels).to(device)
    g_lists = []
    for nx_G_list in nx_G_lists:
        g_lists.append([])
        for nx_G in nx_G_list:
            g = dgl.DGLGraph(multigraph=True)
            g.add_nodes(nx_G.number_of_nodes())
            g.add_edges(*list(
                zip(*sorted(
                    map(lambda tup: (int(tup[0]), int(tup[1])),
                        nx_G.edges())))))
            g_lists[-1].append(g)
    train_idx = train_val_test_idx['train_idx']
    val_idx = train_val_test_idx['val_idx']
    test_idx = train_val_test_idx['test_idx']

    svm_macro_f1_lists = []
    svm_micro_f1_lists = []
    nmi_mean_list = []
    nmi_std_list = []
    ari_mean_list = []
    ari_std_list = []
    for _ in range(repeat):
        net = MAGNN_nc(num_layers, [2, 2, 2], 4, etypes_lists, in_dims,
                       hidden_dim, out_dim, num_heads, attn_vec_dim, rnn_type,
                       dropout_rate)
        net.to(device)
        optimizer = torch.optim.Adam(net.parameters(),
                                     lr=lr,
                                     weight_decay=weight_decay)
        target_node_indices = np.where(type_mask == 0)[0]

        # training loop
        net.train()
        early_stopping = EarlyStopping(
            patience=patience,
            verbose=True,
            save_path='checkpoint/checkpoint_{}.pt'.format(save_postfix))
        dur1 = []
        dur2 = []
        dur3 = []
        for epoch in range(num_epochs):
            t0 = time.time()

            # training forward
            net.train()
            logits, embeddings = net((g_lists, features_list, type_mask,
                                      edge_metapath_indices_lists),
                                     target_node_indices)
            logp = F.log_softmax(logits, 1)
            train_loss = F.nll_loss(logp[train_idx], labels[train_idx])

            t1 = time.time()
            dur1.append(t1 - t0)

            # autograd
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            t2 = time.time()
            dur2.append(t2 - t1)

            # validation forward
            net.eval()
            with torch.no_grad():
                logits, embeddings = net((g_lists, features_list, type_mask,
                                          edge_metapath_indices_lists),
                                         target_node_indices)
                logp = F.log_softmax(logits, 1)
                val_loss = F.nll_loss(logp[val_idx], labels[val_idx])

            t3 = time.time()
            dur3.append(t3 - t2)

            # print info
            print(
                "Epoch {:05d} | Train_Loss {:.4f} | Val_Loss {:.4f} | Time1(s) {:.4f} | Time2(s) {:.4f} | Time3(s) {:.4f}"
                .format(epoch, train_loss.item(), val_loss.item(),
                        np.mean(dur1), np.mean(dur2), np.mean(dur3)))
            # early stopping
            early_stopping(val_loss, net)
            if early_stopping.early_stop:
                print('Early stopping!')
                break

        # testing with evaluate_results_nc
        net.load_state_dict(
            torch.load('checkpoint/checkpoint_{}.pt'.format(save_postfix)))
        net.eval()
        with torch.no_grad():
            logits, embeddings = net((g_lists, features_list, type_mask,
                                      edge_metapath_indices_lists),
                                     target_node_indices)
            svm_macro_f1_list, svm_micro_f1_list, nmi_mean, nmi_std, ari_mean, ari_std = evaluate_results_nc(
                embeddings[test_idx].cpu().numpy(),
                labels[test_idx].cpu().numpy(),
                num_classes=out_dim)
        svm_macro_f1_lists.append(svm_macro_f1_list)
        svm_micro_f1_lists.append(svm_micro_f1_list)
        nmi_mean_list.append(nmi_mean)
        nmi_std_list.append(nmi_std)
        ari_mean_list.append(ari_mean)
        ari_std_list.append(ari_std)

    # print out a summary of the evaluations
    svm_macro_f1_lists = np.transpose(np.array(svm_macro_f1_lists), (1, 0, 2))
    svm_micro_f1_lists = np.transpose(np.array(svm_micro_f1_lists), (1, 0, 2))
    nmi_mean_list = np.array(nmi_mean_list)
    nmi_std_list = np.array(nmi_std_list)
    ari_mean_list = np.array(ari_mean_list)
    ari_std_list = np.array(ari_std_list)
    print('----------------------------------------------------------------')
    print('SVM tests summary')
    print('Macro-F1: ' + ', '.join([
        '{:.6f}~{:.6f} ({:.1f})'.format(macro_f1[:, 0].mean(),
                                        macro_f1[:, 1].mean(), train_size) for
        macro_f1, train_size in zip(svm_macro_f1_lists, [0.8, 0.6, 0.4, 0.2])
    ]))
    print('Micro-F1: ' + ', '.join([
        '{:.6f}~{:.6f} ({:.1f})'.format(micro_f1[:, 0].mean(),
                                        micro_f1[:, 1].mean(), train_size) for
        micro_f1, train_size in zip(svm_micro_f1_lists, [0.8, 0.6, 0.4, 0.2])
    ]))
    print('K-means tests summary')
    print('NMI: {:.6f}~{:.6f}'.format(nmi_mean_list.mean(),
                                      nmi_std_list.mean()))
    print('ARI: {:.6f}~{:.6f}'.format(ari_mean_list.mean(),
                                      ari_std_list.mean()))
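
`evaluate_results_nc` is called above to score the learned node embeddings but is not shown. The sketch below illustrates what it typically computes, assuming a linear-SVM test over several training fractions plus a K-means clustering test scored with NMI/ARI; the function name matches the call above, but the body, the number of repeats and the exact metrics are assumptions for illustration.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import (adjusted_rand_score, f1_score,
                             normalized_mutual_info_score)
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC


def evaluate_results_nc(embeddings, labels, num_classes, repeats=10):
    # SVM test: train a linear SVM on a fraction of the embeddings, report macro/micro F1.
    svm_macro_f1_list, svm_micro_f1_list = [], []
    for train_size in (0.8, 0.6, 0.4, 0.2):
        macro, micro = [], []
        for _ in range(repeats):
            X_tr, X_te, y_tr, y_te = train_test_split(
                embeddings, labels, train_size=train_size)
            y_pred = LinearSVC(dual=False).fit(X_tr, y_tr).predict(X_te)
            macro.append(f1_score(y_te, y_pred, average='macro'))
            micro.append(f1_score(y_te, y_pred, average='micro'))
        svm_macro_f1_list.append((np.mean(macro), np.std(macro)))
        svm_micro_f1_list.append((np.mean(micro), np.std(micro)))

    # K-means test: cluster the embeddings and compare the clusters with the labels.
    nmi, ari = [], []
    for _ in range(repeats):
        cluster_pred = KMeans(n_clusters=num_classes).fit_predict(embeddings)
        nmi.append(normalized_mutual_info_score(labels, cluster_pred))
        ari.append(adjusted_rand_score(labels, cluster_pred))
    return (svm_macro_f1_list, svm_micro_f1_list,
            np.mean(nmi), np.std(nmi), np.mean(ari), np.std(ari))
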
Ejemplo n.º 20
0
def main():
    # Training settings
    # Note: Hyper-parameters need to be tuned in order to obtain results reported in the paper.
    parser = argparse.ArgumentParser(
        description=
        'Implementation of COMPACT GRAPH ARCHITECTURE FOR SPEECH EMOTION RECOGNITION paper'
    )
    parser.add_argument('--dataset',
                        type=str,
                        default="IEMOCAP",
                        help='name of dataset (default: IEMOCAP)')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='input batch size for training (default: 128)')
    parser.add_argument(
        '--iters_per_epoch',
        type=int,
        default=50,
        help='number of iterations per epoch (default: 50)')
    parser.add_argument('--epochs',
                        type=int,
                        default=1000,
                        help='number of epochs to train (default: 1000)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='learning rate (default: 0.0005)')
    parser.add_argument(
        '--seed',
        type=int,
        default=0,
        help='random seed for splitting the dataset into 10 (default: 0)')
    parser.add_argument(
        '--fold_idx',
        type=int,
        default=5,
        help='the index of fold in 10-fold validation. Should be less than 10.'
    )
    parser.add_argument(
        '--num_layers',
        type=int,
        default=2,
        help='number of layers INCLUDING the input one (default: 2)')
    parser.add_argument('--hidden_dim',
                        type=int,
                        default=64,
                        help='number of hidden units (default: 64)')
    parser.add_argument('--final_dropout',
                        type=float,
                        default=0.5,
                        help='final layer dropout (default: 0.5)')
    parser.add_argument(
        '--graph_pooling_type',
        type=str,
        default="sum",
        choices=["sum", "average"],
        help=
        'Pooling over nodes in a graph to get graph embedding: sum or average')
    parser.add_argument('--graph_type',
                        type=str,
                        default="line",
                        choices=["line", "cycle"],
                        help='Graph construction options')
    parser.add_argument('--Normalize',
                        type=bool,
                        default=True,
                        choices=[True, False],
                        help='Normalizing data')
    parser.add_argument('--patience',
                        type=int,
                        default=10,
                        help='early stopping patience (default: 10)')
    parser.add_argument('--beta1',
                        default=0.9,
                        type=float,
                        help='beta1 for adam')
    parser.add_argument('--beta2',
                        default=0.999,
                        type=float,
                        help='beta2 for adam')
    parser.add_argument('--weight-decay',
                        '--wd',
                        default=1e-4,
                        type=float,
                        metavar='W',
                        help='weight decay (default: 1e-4)')
    args = parser.parse_args()

    #set up seeds and gpu device
    torch.manual_seed(0)
    np.random.seed(0)
    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(0)

    ##load data
    graphs, num_classes = load_data(args.dataset, args.Normalize)

    ##10-fold cross validation. Conduct an experiment on the fold specified by args.fold_idx.
    train_graphs, test_graphs = separate_data(graphs, args.seed, args.fold_idx)

    A = nx.to_numpy_matrix(train_graphs[0][0].g)
    if (args.graph_type == 'cycle'):
        A[0, -1] = 1
        A[-1, 0] = 1
    A = torch.Tensor(A).to(device)

    model = Graph_CNN_ortega(args.num_layers,
                             train_graphs[0][0].node_features.shape[1],
                             args.hidden_dim, num_classes, args.final_dropout,
                             args.graph_pooling_type, device, A).to(device)

    Num_Param = sum(p.numel() for p in model.parameters() if p.requires_grad)

    b = 0
    for p in model.parameters():
        if p.requires_grad:
            a = p.numel()
            b += a
    print("Number of Trainable Parameters= %d" % (Num_Param))

    acc_train_sum = 0
    acc_test_sum = 0

    for i in range(args.fold_idx):
        train_data = train_graphs[i]
        test_data = test_graphs[i]

        # optimizer = RAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
        #                   weight_decay=args.weight_decay)
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        # optimizer = AdamW(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
        #                   weight_decay=args.weight_decay)
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=50,
                                              gamma=0.5)

        early_stopping = EarlyStopping(patience=args.patience, verbose=True)

        for epoch in range(1, args.epochs + 1):
            scheduler.step()

            avg_loss = train(args, model, device, train_data, optimizer, epoch,
                             A)

            if (epoch > 1):
                #### Validation check
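                # `criterion` is assumed to be defined elsewhere in this script (e.g. a module-level nn.CrossEntropyLoss)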
                with torch.no_grad():
                    val_out = pass_data_iteratively(model, test_data)
                    val_labels = torch.LongTensor(
                        [graph.label for graph in test_data]).to(device)
                    val_loss = criterion(val_out, val_labels)
                    val_loss = np.average(val_loss.detach().cpu().numpy())

                #### Check early stopping
                early_stopping(val_loss, model)

                if early_stopping.early_stop:
                    print("Early stopping")
                    break

            if ((epoch > 300) and (epoch % 20 == 0)) or (epoch % 10 == 0):
                acc_train, acc_test, _, _ = test(args, model, device,
                                                 train_data, test_data,
                                                 num_classes)

        model.load_state_dict(torch.load('checkpoint.pt'))

        acc_train, acc_test, output, label = test(args, model, device,
                                                  train_data, test_data,
                                                  num_classes)
        acc_train_sum += acc_train
        acc_test_sum += acc_test

        model = Graph_CNN_ortega(args.num_layers,
                                 train_graphs[0][0].node_features.shape[1],
                                 args.hidden_dim, num_classes,
                                 args.final_dropout, args.graph_pooling_type,
                                 device, A).to(device)

    print('Average train acc: %f,  Average test acc: %f' %
          (acc_train_sum / args.fold_idx, acc_test_sum / args.fold_idx))
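
`pass_data_iteratively` above feeds the held-out graphs through the model in chunks, but its implementation is not included. A minimal sketch follows, assuming the model accepts a list of graph objects per forward pass (as the rest of the snippet suggests); the minibatch size and the argument names are assumptions for illustration.

import numpy as np
import torch


def pass_data_iteratively(model, graphs, minibatch_size=64):
    # Run the model over all graphs in minibatches and concatenate the outputs.
    model.eval()
    outputs = []
    idx = np.arange(len(graphs))
    for i in range(0, len(graphs), minibatch_size):
        batch = [graphs[j] for j in idx[i:i + minibatch_size]]
        with torch.no_grad():
            outputs.append(model(batch).detach())
    return torch.cat(outputs, dim=0)
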
Ejemplo n.º 21
0
    def __init__(self, input_size, n_channels, hparams, gpu, inference=False):

        self.hparams = hparams

        if inference:
            self.device = torch.device('cpu')
            self.model = ECGNet(n_channels=n_channels,
                                hparams=self.hparams).to(self.device)
        else:
            if torch.cuda.device_count() > 1:
                if len(gpu) > 0:
                    print("Number of GPUs will be used: ", len(gpu))
                    self.device = torch.device(f"cuda:{gpu[0]}" if torch.cuda.
                                               is_available() else "cpu")
                    self.model = ECGNet(n_channels=n_channels,
                                        hparams=self.hparams).to(self.device)
                    self.model = DP(self.model,
                                    device_ids=gpu,
                                    output_device=gpu[0])
                else:
                    print("Number of GPUs will be used: ",
                          torch.cuda.device_count() - 5)
                    self.device = torch.device(
                        "cuda:0" if torch.cuda.is_available() else "cpu")
                    self.model = ECGNet(n_channels=n_channels,
                                        hparams=self.hparams).to(self.device)
                    self.model = DP(self.model,
                                    device_ids=list(
                                        range(torch.cuda.device_count() - 5)))
            else:
                self.device = torch.device(
                    "cuda:0" if torch.cuda.is_available() else "cpu")
                self.model = ECGNet(n_channels=n_channels,
                                    hparams=self.hparams).to(self.device)
                print('Only one GPU is available')

        # define the models
        #summary(self.model, (input_size, n_channels))
        #print(torch.cuda.is_available())

        self.metric = Metric()
        self.num_workers = 18
        self.threshold = 0.5

        ########################## compile the model ###############################

        # define optimizer
        self.optimizer = torch.optim.Adam(params=self.model.parameters(),
                                          lr=self.hparams['lr'])

        weights = torch.Tensor([
            1., 1., 1., 1., 0.5, 1., 1., 1., 1., 1., 1., 1., 0.5, 0.5, 1., 1.,
            1., 1., 0.5, 1., 1., 1., 1., 0.5, 1., 1., 0.5
        ]).to(self.device)

        self.loss = nn.BCELoss(weight=weights)  # CompLoss(self.device) #
        self.decoder_loss = nn.MSELoss()

        # define early stopping
        self.early_stopping = EarlyStopping(
            checkpoint_path=self.hparams['checkpoint_path'] + '/checkpoint' +
            str(self.hparams['start_fold']) + '.pt',
            patience=self.hparams['patience'],
            delta=self.hparams['min_delta'],
            is_maximize=True,
        )
        # lr cheduler
        self.scheduler = ReduceLROnPlateau(
            optimizer=self.optimizer,
            mode='max',
            factor=0.2,
            patience=1,
            verbose=True,
            threshold=self.hparams['min_delta'],
            threshold_mode='abs',
            cooldown=0,
            eps=0,
        )

        self.seed_everything(42)

        self.postprocessing = PostProcessing(fold=self.hparams['start_fold'])
        self.scaler = torch.cuda.amp.GradScaler()
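
The constructor above creates a `torch.cuda.amp.GradScaler`, but the training step that uses it is not part of this snippet. For reference, a typical mixed-precision step with a scaler looks like the sketch below; `model`, `loss_fn`, `optimizer` and `dataloader` are placeholders, not names from the snippet.

import torch

scaler = torch.cuda.amp.GradScaler()
for inputs, targets in dataloader:
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
    scaler.scale(loss).backward()   # scale the loss so fp16 gradients do not underflow
    scaler.step(optimizer)          # unscales the gradients, then runs the optimizer step
    scaler.update()                 # adjust the scale factor for the next iteration
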
Ejemplo n.º 22
0
def run_model_DBLP(feats_type, hidden_dim, num_heads, attn_vec_dim, rnn_type,
                   num_epochs, patience, batch_size, neighbor_samples, repeat,
                   save_postfix):
    adjlists, edge_metapath_indices_list, features_list, adjM, type_mask, labels, train_val_test_idx = load_DBLP_data(
    )
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    features_list = [
        torch.FloatTensor(features).to(device) for features in features_list
    ]
    if feats_type == 0:
        in_dims = [features.shape[1] for features in features_list]
    elif feats_type == 1:
        in_dims = [features_list[0].shape[1]] + [10] * (len(features_list) - 1)
        for i in range(1, len(features_list)):
            features_list[i] = torch.zeros(
                (features_list[i].shape[0], 10)).to(device)
    elif feats_type == 2:
        in_dims = [features.shape[0] for features in features_list]
        in_dims[0] = features_list[0].shape[1]
        for i in range(1, len(features_list)):
            dim = features_list[i].shape[0]
            indices = np.vstack((np.arange(dim), np.arange(dim)))
            indices = torch.LongTensor(indices)
            values = torch.FloatTensor(np.ones(dim))
            features_list[i] = torch.sparse.FloatTensor(
                indices, values, torch.Size([dim, dim])).to(device)
    elif feats_type == 3:
        in_dims = [features.shape[0] for features in features_list]
        for i in range(len(features_list)):
            dim = features_list[i].shape[0]
            indices = np.vstack((np.arange(dim), np.arange(dim)))
            indices = torch.LongTensor(indices)
            values = torch.FloatTensor(np.ones(dim))
            features_list[i] = torch.sparse.FloatTensor(
                indices, values, torch.Size([dim, dim])).to(device)
    labels = torch.LongTensor(labels).to(device)
    train_idx = train_val_test_idx['train_idx']
    train_idx = np.sort(train_idx)
    val_idx = train_val_test_idx['val_idx']
    val_idx = np.sort(val_idx)
    test_idx = train_val_test_idx['test_idx']
    test_idx = np.sort(test_idx)

    svm_macro_f1_lists = []
    svm_micro_f1_lists = []
    nmi_mean_list = []
    nmi_std_list = []
    ari_mean_list = []
    ari_std_list = []
    for _ in range(repeat):
        net = MAGNN_nc_mb(3, 6, etypes_list, in_dims, hidden_dim, out_dim,
                          num_heads, attn_vec_dim, rnn_type, dropout_rate)
        net.to(device)
        optimizer = torch.optim.Adam(net.parameters(),
                                     lr=lr,
                                     weight_decay=weight_decay)

        # training loop
        net.train()
        early_stopping = EarlyStopping(
            patience=patience,
            verbose=True,
            save_path='checkpoint/checkpoint_{}.pt'.format(save_postfix))
        dur1 = []
        dur2 = []
        dur3 = []
        train_idx_generator = index_generator(batch_size=batch_size,
                                              indices=train_idx)
        val_idx_generator = index_generator(batch_size=batch_size,
                                            indices=val_idx,
                                            shuffle=False)
        for epoch in range(num_epochs):
            t_start = time.time()
            # training
            net.train()
            for iteration in range(train_idx_generator.num_iterations()):
                # forward
                t0 = time.time()

                train_idx_batch = train_idx_generator.next()
                train_idx_batch.sort()
                train_g_list, train_indices_list, train_idx_batch_mapped_list = parse_minibatch(
                    adjlists, edge_metapath_indices_list, train_idx_batch,
                    device, neighbor_samples)

                t1 = time.time()
                dur1.append(t1 - t0)

                logits, embeddings = net(
                    (train_g_list, features_list, type_mask,
                     train_indices_list, train_idx_batch_mapped_list))
                logp = F.log_softmax(logits, 1)
                train_loss = F.nll_loss(logp, labels[train_idx_batch])

                t2 = time.time()
                dur2.append(t2 - t1)

                # autograd
                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()

                t3 = time.time()
                dur3.append(t3 - t2)

                # print training info
                if iteration % 50 == 0:
                    print(
                        'Epoch {:05d} | Iteration {:05d} | Train_Loss {:.4f} | Time1(s) {:.4f} | Time2(s) {:.4f} | Time3(s) {:.4f}'
                        .format(epoch, iteration, train_loss.item(),
                                np.mean(dur1), np.mean(dur2), np.mean(dur3)))

            # validation
            net.eval()
            val_logp = []
            with torch.no_grad():
                for iteration in range(val_idx_generator.num_iterations()):
                    # forward
                    val_idx_batch = val_idx_generator.next()
                    val_g_list, val_indices_list, val_idx_batch_mapped_list = parse_minibatch(
                        adjlists, edge_metapath_indices_list, val_idx_batch,
                        device, neighbor_samples)
                    logits, embeddings = net(
                        (val_g_list, features_list, type_mask,
                         val_indices_list, val_idx_batch_mapped_list))
                    logp = F.log_softmax(logits, 1)
                    val_logp.append(logp)
                val_loss = F.nll_loss(torch.cat(val_logp, 0), labels[val_idx])
            t_end = time.time()
            # print validation info
            print('Epoch {:05d} | Val_Loss {:.4f} | Time(s) {:.4f}'.format(
                epoch, val_loss.item(), t_end - t_start))
            # early stopping
            early_stopping(val_loss, net)
            if early_stopping.early_stop:
                print('Early stopping!')
                break

        # testing with evaluate_results_nc
        test_idx_generator = index_generator(batch_size=batch_size,
                                             indices=test_idx,
                                             shuffle=False)
        net.load_state_dict(
            torch.load('checkpoint/checkpoint_{}.pt'.format(save_postfix)))
        net.eval()
        test_embeddings = []
        with torch.no_grad():
            for iteration in range(test_idx_generator.num_iterations()):
                # forward
                test_idx_batch = test_idx_generator.next()
                test_g_list, test_indices_list, test_idx_batch_mapped_list = parse_minibatch(
                    adjlists, edge_metapath_indices_list, test_idx_batch,
                    device, neighbor_samples)
                logits, embeddings = net(
                    (test_g_list, features_list, type_mask, test_indices_list,
                     test_idx_batch_mapped_list))
                test_embeddings.append(embeddings)
            test_embeddings = torch.cat(test_embeddings, 0)
            svm_macro_f1_list, svm_micro_f1_list, nmi_mean, nmi_std, ari_mean, ari_std = evaluate_results_nc(
                test_embeddings.cpu().numpy(),
                labels[test_idx].cpu().numpy(),
                num_classes=out_dim)
        svm_macro_f1_lists.append(svm_macro_f1_list)
        svm_micro_f1_lists.append(svm_micro_f1_list)
        nmi_mean_list.append(nmi_mean)
        nmi_std_list.append(nmi_std)
        ari_mean_list.append(ari_mean)
        ari_std_list.append(ari_std)

    # print out a summary of the evaluations
    svm_macro_f1_lists = np.transpose(np.array(svm_macro_f1_lists), (1, 0, 2))
    svm_micro_f1_lists = np.transpose(np.array(svm_micro_f1_lists), (1, 0, 2))
    nmi_mean_list = np.array(nmi_mean_list)
    nmi_std_list = np.array(nmi_std_list)
    ari_mean_list = np.array(ari_mean_list)
    ari_std_list = np.array(ari_std_list)
    print('----------------------------------------------------------------')
    print('SVM tests summary')
    print('Macro-F1: ' + ', '.join([
        '{:.6f}~{:.6f} ({:.1f})'.format(macro_f1[:, 0].mean(),
                                        macro_f1[:, 1].mean(), train_size) for
        macro_f1, train_size in zip(svm_macro_f1_lists, [0.8, 0.6, 0.4, 0.2])
    ]))
    print('Micro-F1: ' + ', '.join([
        '{:.6f}~{:.6f} ({:.1f})'.format(micro_f1[:, 0].mean(),
                                        micro_f1[:, 1].mean(), train_size) for
        micro_f1, train_size in zip(svm_micro_f1_lists, [0.8, 0.6, 0.4, 0.2])
    ]))
    print('K-means tests summary')
    print('NMI: {:.6f}~{:.6f}'.format(nmi_mean_list.mean(),
                                      nmi_std_list.mean()))
    print('ARI: {:.6f}~{:.6f}'.format(ari_mean_list.mean(),
                                      ari_std_list.mean()))
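
The EarlyStopping helper used in this example is imported rather than defined here. As a rough sketch of the interface it is assumed to expose (patience, verbose, save_path, an early_stop flag, and checkpointing whenever the validation loss improves), it might look like the following; the exact keyword names vary between the examples on this page (path, checkpoint_path, delta, is_maximize).

import numpy as np
import torch


class EarlyStopping:
    """Sketch: stop training when the validation loss has not improved for `patience` epochs."""

    def __init__(self, patience=7, verbose=False, save_path='checkpoint.pt', delta=0.0):
        self.patience = patience
        self.verbose = verbose
        self.save_path = save_path
        self.delta = delta
        self.counter = 0
        self.best_loss = np.inf
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss - self.delta:
            # improvement: checkpoint the model weights and reset the patience counter
            if self.verbose:
                print('Validation loss improved ({:.6f} -> {:.6f}), saving model.'.format(
                    self.best_loss, val_loss))
            torch.save(model.state_dict(), self.save_path)
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True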
Ejemplo n.º 23
0
#Iterative optimization routine, can choose other techniques (e.g., Adam)
optimizer_conv = optim.SGD(filter(lambda p: p.requires_grad,
                                  model_conv.parameters()),
                           lr=lr,
                           momentum=momentum)  # only parameters with requires_grad=True are updated

#Learning rate decay scheduler
exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer_conv,
                                                  'min',
                                                  factor=gamma,
                                                  verbose=True,
                                                  patience=8)

# Try to reduce overfitting by using early stopping
early_stopping = EarlyStopping()


## Note: the following function is adapted from the PyTorch transfer learning tutorial
# https://github.com/pytorch/tutorials/blob/master/beginner_source/transfer_learning_tutorial.py
def train_model(model, criterion, optimizer, scheduler, num_epochs):
    since = time.time()
    # Lists to record and monitor learning convergence
    val_loss = []
    val_acc = []

    train_loss = []
    train_acc = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
Ejemplo n.º 24
0
def train():
    # initialization
    device = t.device("cuda:0" if t.cuda.is_available() else "cpu")
    opt = DefaultConfig()
    train_losses, valid_losses, avg_train_losses, avg_valid_losses = [], [], [], []
    writer = SummaryWriter('logs')
    criterion = nn.CrossEntropyLoss().to(device)
    config_data = [['Key', 'Value'], ['device', device]]

    # config
    config_generator(config_data, opt)

    # data
    train_data, train_dataloader, val_data, val_dataloader = data_generator(
        opt)

    # model
    model = model_generator(device, opt)

    # optimizer & lr_scheduler & early_stopping
    optimizer = Adam(model.fc.parameters(),
                     lr=opt.lr,
                     weight_decay=opt.weight_decay)
    early_stopping = EarlyStopping(patience=5,
                                   verbose=False,
                                   path='checkpoints/%s_final_checkpoint.pth' %
                                   opt.model)

    print('Starting training on %d images:' % len(train_data))

    # Train with frozen layers first, to get a stable loss.
    # Adjust the number of epochs to your dataset; this step alone is usually enough
    # to obtain a reasonable model (a minimal sketch of the freeze/unfreeze pattern
    # follows this function).
    if opt.freeze:
        for epoch in range(opt.freeze_epoch):
            print('Epoch {}/{} :'.format(epoch, opt.freeze_epoch +
                                         opt.unfreeze_epoch))
            model.train()

            train_loss, loss_meter, correct = 0, 0, 0

            # train epoch
            for batch_i, (data, label, _) in enumerate(tqdm(train_dataloader)):
                input = Variable(data).to(device)
                target = Variable(label).to(device)
                optimizer.zero_grad()
                predict = model(input)
                loss = criterion(predict, target)
                loss.backward()
                optimizer.step()
                train_losses.append(loss.item())
                loss_meter += loss.item()
                logits = t.relu(predict)
                pred = logits.data.max(1)[1]
                correct += pred.eq(target.data).sum()
            train_acc = correct.cpu().detach().numpy() * 1.0 / len(
                train_dataloader.dataset)
            # ending of train epoch

            # validation epoch
            if epoch % opt.evaluation_interval == 0:
                if t.cuda.device_count() > 1:
                    model = nn.DataParallel(model, device_ids=[0])
                model.eval()
                loss_meter, correct = 0, 0
                with t.no_grad():
                    print('Validating on %d images:' % len(val_data))
                    for inputs, target, _ in tqdm(val_dataloader):
                        inputs = inputs.to(device)
                        target = target.to(device)
                        output = model(inputs)
                        loss = criterion(output, target)
                        loss_meter += loss.item()
                        valid_losses.append(loss.item())
                        logits = t.relu(output)
                        pred = logits.data.max(1)[1]
                        correct += pred.eq(target.data).sum()
                    val_acc = correct.cpu().detach().numpy() * 1.0 / len(
                        val_dataloader.dataset)
            # ending of validation epoch

            train_loss = np.average(train_losses)
            valid_loss = np.average(valid_losses)
            avg_train_losses.append(train_loss)
            avg_valid_losses.append(valid_loss)

            print('train_loss: %.3f, train_acc: %.3f' %
                  (train_loss, train_acc))
            print('val_loss: %.3f, val_acc: %.3f' % (valid_loss, val_acc))
            writer.add_scalar('train_loss', train_loss, global_step=epoch)
            writer.add_scalar('train_acc', train_acc, global_step=epoch)
            writer.add_scalar('valid_loss', valid_loss, global_step=epoch)
            writer.add_scalar('val_acc', val_acc, global_step=epoch)

            # clear lists to track next epoch
            train_losses.clear()
            valid_losses.clear()

            # early_stopping needs the validation loss to check if it has decreased,
            # and if it has, it will make a checkpoint of the current model
            early_stopping(valid_loss, model)

            if early_stopping.early_stop:
                print("Early stopping")
                opt.unfreeze = False
                break

            if epoch % opt.checkpoint_interval == 0:
                t.save(model.state_dict(),
                       'checkpoints/' + '%s_ckpt_%d.pth' % (opt.model, epoch))

            print_separator()

            # load the last checkpoint with the best model
            model.load_state_dict(
                t.load('checkpoints/' + '%s_final_checkpoint.pth' % opt.model))

    # Unfreeze and continue training, to fine-tune.
    # Train longer if the result is not good.
    if opt.unfreeze:
        print('Unfreeze all layers:')
        for param in model.parameters():
            param.requires_grad = True

        optimizer = Adam(model.parameters(),
                         lr=opt.lr,
                         weight_decay=opt.weight_decay)
        lr_scheduler = ReduceLROnPlateau(optimizer,
                                         'min',
                                         factor=opt.lr_decay,
                                         patience=3,
                                         verbose=True)

        for epoch in range(opt.freeze_epoch, opt.unfreeze_epoch):
            print('Epoch {}/{} :'.format(epoch, opt.unfreeze_epoch +
                                         opt.freeze_epoch))
            model.train()

            train_loss, loss_meter, correct = 0, 0, 0

            # train epoch
            for batch_i, (data, label, _) in enumerate(tqdm(train_dataloader)):
                input = Variable(data).to(device)
                target = Variable(label).to(device)
                optimizer.zero_grad()
                predict = model(input)
                loss = criterion(predict, target)
                loss.backward()
                optimizer.step()
                train_losses.append(loss.item())
                loss_meter += loss.item()
                logits = t.relu(predict)
                pred = logits.data.max(1)[1]
                correct += pred.eq(target.data).sum()
            train_acc = correct.cpu().detach().numpy() * 1.0 / len(
                train_dataloader.dataset)
            # ending of train epoch

            # validation epoch
            if epoch % opt.evaluation_interval == 0:
                if t.cuda.device_count() > 1:
                    model = nn.DataParallel(model, device_ids=[0])
                model.eval()
                loss_meter, correct = 0, 0
                with t.no_grad():
                    print('Validating on %d images:' % len(val_data))
                    for inputs, target, _ in tqdm(val_dataloader):
                        inputs = inputs.to(device)
                        target = target.to(device)
                        output = model(inputs)
                        loss = criterion(output, target)
                        loss_meter += loss.item()
                        valid_losses.append(loss.item())
                        logits = t.relu(output)
                        pred = logits.data.max(1)[1]
                        correct += pred.eq(target.data).sum()
                    val_acc = correct.cpu().detach().numpy() * 1.0 / len(
                        val_dataloader.dataset)
            # ending of validation epoch

            # step the plateau scheduler on the averaged validation loss of this epoch
            lr_scheduler.step(np.average(valid_losses))
            writer.add_scalar('learning_rate',
                              optimizer.param_groups[0]['lr'],
                              global_step=epoch)

            train_loss = np.average(train_losses)
            valid_loss = np.average(valid_losses)
            avg_train_losses.append(train_loss)
            avg_valid_losses.append(valid_loss)

            print('train_loss: %.3f, train_acc: %.3f' %
                  (train_loss, train_acc))
            print('val_loss: %.3f, val_acc: %.3f' % (valid_loss, val_acc))
            writer.add_scalar('train_loss', train_loss, global_step=epoch)
            writer.add_scalar('train_acc', train_acc, global_step=epoch)
            writer.add_scalar('valid_loss', valid_loss, global_step=epoch)
            writer.add_scalar('val_acc', val_acc, global_step=epoch)

            # clear lists to track next epoch
            train_losses.clear()
            valid_losses.clear()

            # early_stopping needs the validation loss to check if it has decreased,
            # and if it has, it will make a checkpoint of the current model
            early_stopping(valid_loss, model)

            if early_stopping.early_stop:
                print("Early stopping")
                break

            if epoch % opt.checkpoint_interval == 0:
                t.save(model.state_dict(),
                       'checkpoints/' + '%s_ckpt_%d.pth' % (opt.model, epoch))

            print_separator()

            # load the last checkpoint with the best model
            model.load_state_dict(
                t.load('checkpoints/' + '%s_final_checkpoint.pth' % opt.model))
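
The two-phase strategy used above, training only the classifier head while the backbone is frozen and then unfreezing everything for fine-tuning, does not depend on the specific network. A minimal sketch of the two steps, assuming a torchvision-style model whose classifier is exposed as `model.fc`, could look like this:

import torch.nn as nn
from torch.optim import Adam


def freeze_backbone(model: nn.Module, lr=1e-3, weight_decay=1e-4):
    """Sketch: freeze every parameter except the final classifier head."""
    for param in model.parameters():
        param.requires_grad = False
    for param in model.fc.parameters():
        param.requires_grad = True
    # the optimizer only sees the trainable head parameters
    return Adam(model.fc.parameters(), lr=lr, weight_decay=weight_decay)


def unfreeze_all(model: nn.Module, lr=1e-4, weight_decay=1e-4):
    """Sketch: unfreeze everything and hand all parameters to a fresh optimizer for fine-tuning."""
    for param in model.parameters():
        param.requires_grad = True
    return Adam(model.parameters(), lr=lr, weight_decay=weight_decay)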
Ejemplo n.º 25
0
def train_test(args, data):
    (user_history_dict, entity_embedding, relation_embedding, entity_adj,
     relation_adj, doc_feature_dict, entity_num, position_num, type_num,
     user2item_train, user2item_test, vert_train, vert_test, local_train,
     local_test, pop_train, pop_test, item2item_train, item2item_test) = data
    #user2item_train, user2item_test, vert_train, vert_test, local_train, local_test, pop_train, pop_test, item2item_train, item2item_test = data

    train_data_u2i = NewsDataset(user2item_train)
    train_sampler_u2i = RandomSampler(train_data_u2i)
    train_dataloader_u2i = DataLoader(train_data_u2i, sampler=train_sampler_u2i, batch_size=args.batch_size, collate_fn=my_collate_fn, pin_memory=False)

    train_data_vert = NewsDataset(vert_train)
    train_sampler_vert = RandomSampler(train_data_vert)
    train_dataloader_vert = DataLoader(train_data_vert, sampler=train_sampler_vert, batch_size=args.batch_size, pin_memory=False)

    train_data_local = NewsDataset(local_train)
    train_sampler_local = RandomSampler(train_data_local)
    train_dataloader_local = DataLoader(train_data_local, sampler=train_sampler_local, batch_size=args.batch_size, pin_memory=False)

    train_data_pop = NewsDataset(pop_train)
    train_sampler_pop = RandomSampler(train_data_pop)
    train_dataloader_pop = DataLoader(train_data_pop, sampler=train_sampler_pop, batch_size=args.batch_size, pin_memory=False)

    train_data_i2i = NewsDataset(item2item_train)
    train_sampler_i2i = RandomSampler(train_data_i2i)
    train_dataloader_i2i = DataLoader(train_data_i2i, sampler=train_sampler_i2i, batch_size=args.batch_size, pin_memory=False)

    valid_scores = []
    early_stopping = EarlyStopping(patience=2, verbose=True)

    print("learning rate {} l2_regular {}".format(args.learning_rate, args.l2_regular))

    model = KRED(args, user_history_dict, doc_feature_dict, entity_embedding, relation_embedding, entity_adj,
                 relation_adj, entity_num, position_num, type_num).cuda()

    if args.training_type == "multi-task":
        pretrain_epoch = 0
        while pretrain_epoch < 5:
            model.train()
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=0)
            total_loss_vert = 0
            model.train()
            for step, batch in enumerate(train_dataloader_vert):
                out = model(batch['item1'], batch['item2'], "vert_classify")[1]
                loss = criterion(out, torch.tensor(batch['label']).cuda())
                total_loss_vert = total_loss_vert + loss
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            print('epoch {} loss {}'.format(pretrain_epoch, total_loss_vert))

            total_loss_pop = 0
            model.train()
            for step, batch in enumerate(train_dataloader_pop):
                out = model(batch['item1'], batch['item2'], "pop_predict")[3]
                loss = criterion(out, torch.tensor(batch['label']).cuda())
                total_loss_pop = total_loss_pop + loss
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            print('epoch {} loss {}'.format(pretrain_epoch, total_loss_pop))

            criterion = nn.BCELoss()
            total_loss_local = 0
            model.train()
            for step, batch in enumerate(train_dataloader_local):
                out = model(batch['item1'], batch['item2'], "local_news")[2]
                loss = criterion(out, torch.tensor(batch['label']).float().cuda())
                total_loss_local = total_loss_local + loss
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            print('epoch {} loss {}'.format(pretrain_epoch, total_loss_local))

            criterion = Softmax_BCELoss(args)
            total_loss_i2i = 0
            model.train()
            for step, batch in enumerate(train_dataloader_i2i):
                out = model(batch['item1'], batch['item2'], "item2item")[4]
                loss = criterion(out, torch.stack(batch['label']).float().cuda())
                total_loss_i2i = total_loss_i2i + loss
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            print('epoch {} loss {}'.format(pretrain_epoch, total_loss_i2i))

            optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.l2_regular)
            total_loss_u2i = 0
            model.train()
            for step, batch in enumerate(train_dataloader_u2i):
                batch = real_batch(batch)
                out = model(batch['item1'], batch['item2'], "user2item")[0]
                loss = criterion(out, torch.tensor(batch['label']).cuda())
                total_loss_u2i = total_loss_u2i + loss
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            print('epoch {} loss {}'.format(pretrain_epoch, total_loss_u2i))
            pretrain_epoch = pretrain_epoch + 1

    for epoch in range(args.epoch):
        if args.task == "user2item":
            test_data = user2item_test
            criterion = Softmax_BCELoss(args)
            train_data_loader = train_dataloader_u2i
            task_index = 0
        elif args.task == "item2item":
            test_data = item2item_test
            criterion = Softmax_BCELoss(args)
            train_data_loader = train_dataloader_i2i
            task_index = 4
        elif args.task == "vert_classify":
            test_data = user2item_test
            criterion = nn.CrossEntropyLoss()
            train_data_loader = train_dataloader_vert
            task_index = 1
        elif args.task == "pop_predict":
            test_data = user2item_test
            criterion = nn.CrossEntropyLoss()
            train_data_loader = train_dataloader_pop
            task_index = 3
        elif args.task == "local_news":
            test_data = user2item_test
            criterion = nn.BCELoss()
            train_data_loader = train_dataloader_local
            task_index = 2
        else:
            print("Error: task name error.")
            break


        optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.l2_regular)
        total_loss = 0
        model.train()
        for step, batch in enumerate(train_data_loader):
            if task_index == 0:
                batch = real_batch(batch)
            if task_index == 4:
                out = model(batch['item1'], batch['item2'], "item2item")[task_index]
                loss = criterion(out, torch.stack(batch['label']).float().cuda())
            elif task_index == 2:
                out = model(batch['item1'], batch['item2'], "local_news")[task_index]
                loss = criterion(out, torch.tensor(batch['label']).float().cuda())
            else:
                out = model(batch['item1'], batch['item2'], args.task)[task_index]
                loss = criterion(out, torch.tensor(batch['label']).cuda())
            total_loss = total_loss + loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('epoch {} loss {}'.format(epoch, total_loss))

        model.eval()
        y_pred = []
        start_list = list(range(0, len(test_data['label']), args.batch_size))
        for start in start_list:
            if start + args.batch_size <= len(test_data['label']):
                end = start + args.batch_size
            else:
                end = len(test_data['label'])
            #out = model(test_data['user_id'][start:end], test_data['news_id'][start:end], args.task)[task_index].view(end-start).cpu().data.numpy()
            #test = model(test_data['user_id'][start:end], test_data['news_id'][start:end], args.task)[task_index].cpu().data.numpy()
            out = model(test_data['user_id'][start:end], test_data['news_id'][start:end], args.task)[task_index].cpu().data.numpy()

            #y_pred = y_pred + out.tolist()
            y_pred.extend(out)
        truth = test_data['label']
        score = evaulate(y_pred, truth, test_data, args.task)
        valid_scores.append(score)

        early_stopping(score, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break


    model.load_state_dict(torch.load('checkpoint.pt'))
    y_pred = []
    start_list = list(range(0, len(test_data['label']), args.batch_size))
    for start in start_list:
        if start + args.batch_size <= len(test_data['label']):
            end = start + args.batch_size
        else:
            end = len(test_data['label'])
        #out = model(test_data['user_id'][start:end], test_data['news_id'][start:end], args.task)[task_index].view(end - start).cpu().data.numpy()
        out = model(test_data['user_id'][start:end], test_data['news_id'][start:end], args.task)[
            task_index].cpu().data.numpy()
        #y_pred = y_pred + out.tolist()
        y_pred.extend(out)

    result_path = "./result_log/" + args.logdir + '/'
    if not os.path.exists(result_path):
        os.mkdir(result_path)
    result_file_path = result_path + "predict_result.txt"
    with open(result_file_path, 'w') as fp:
        for line_index in range(len(y_pred)):
            fp.write(str(y_pred[line_index]) + '\t' + str(truth[line_index]) + '\n')
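
For reference, the batched prediction loops above can be expressed as one helper; the sketch below mirrors the slicing logic (the argument names are assumptions) and wraps it in torch.no_grad(), a guard the loops above omit at evaluation time.

import torch


def predict_in_batches(model, user_ids, news_ids, task, task_index, batch_size):
    """Sketch: run the model over a test split in fixed-size slices."""
    model.eval()
    preds = []
    with torch.no_grad():  # avoid building an autograd graph during evaluation
        for start in range(0, len(user_ids), batch_size):
            end = min(start + batch_size, len(user_ids))
            out = model(user_ids[start:end], news_ids[start:end], task)[task_index]
            preds.extend(out.cpu().numpy())
    return preds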
Ejemplo n.º 26
0
class Model:
    """
    This class handles basic methods for handling the model:
    1. Fit the model
    2. Make predictions
    3. Save
    4. Load
    """
    def __init__(self, input_size, n_channels, hparams, gpu, inference=False):

        self.hparams = hparams

        if inference:
            self.device = torch.device('cpu')
            self.model = ECGNet(n_channels=n_channels,
                                hparams=self.hparams).to(self.device)
        else:
            if torch.cuda.device_count() > 1:
                if len(gpu) > 0:
                    print("Number of GPUs will be used: ", len(gpu))
                    self.device = torch.device(f"cuda:{gpu[0]}" if torch.cuda.
                                               is_available() else "cpu")
                    self.model = ECGNet(n_channels=n_channels,
                                        hparams=self.hparams).to(self.device)
                    self.model = DP(self.model,
                                    device_ids=gpu,
                                    output_device=gpu[0])
                else:
                    print("Number of GPUs will be used: ",
                          torch.cuda.device_count() - 5)
                    self.device = torch.device(
                        "cuda:0" if torch.cuda.is_available() else "cpu")
                    self.model = ECGNet(n_channels=n_channels,
                                        hparams=self.hparams).to(self.device)
                    self.model = DP(self.model,
                                    device_ids=list(
                                        range(torch.cuda.device_count() - 5)))
            else:
                self.device = torch.device(
                    "cuda:0" if torch.cuda.is_available() else "cpu")
                self.model = ECGNet(n_channels=n_channels,
                                    hparams=self.hparams).to(self.device)
                print('Only one GPU is available')

        # define the models
        #summary(self.model, (input_size, n_channels))
        #print(torch.cuda.is_available())

        self.metric = Metric()
        self.num_workers = 18
        self.threshold = 0.5

        ########################## compile the model ###############################

        # define optimizer
        self.optimizer = torch.optim.Adam(params=self.model.parameters(),
                                          lr=self.hparams['lr'])

        weights = torch.Tensor([
            1., 1., 1., 1., 0.5, 1., 1., 1., 1., 1., 1., 1., 0.5, 0.5, 1., 1.,
            1., 1., 0.5, 1., 1., 1., 1., 0.5, 1., 1., 0.5
        ]).to(self.device)

        self.loss = nn.BCELoss(weight=weights)  # alternatively: CompLoss(self.device)
        self.decoder_loss = nn.MSELoss()

        # define early stopping
        self.early_stopping = EarlyStopping(
            checkpoint_path=self.hparams['checkpoint_path'] + '/checkpoint' +
            str(self.hparams['start_fold']) + '.pt',
            patience=self.hparams['patience'],
            delta=self.hparams['min_delta'],
            is_maximize=True,
        )
        # lr scheduler
        self.scheduler = ReduceLROnPlateau(
            optimizer=self.optimizer,
            mode='max',
            factor=0.2,
            patience=1,
            verbose=True,
            threshold=self.hparams['min_delta'],
            threshold_mode='abs',
            cooldown=0,
            eps=0,
        )

        self.seed_everything(42)

        self.postprocessing = PostProcessing(fold=self.hparams['start_fold'])
        self.scaler = torch.cuda.amp.GradScaler()

    def seed_everything(self, seed):
        np.random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        torch.manual_seed(seed)

    def fit(self, train, valid):

        train_loader = DataLoader(train,
                                  batch_size=self.hparams['batch_size'],
                                  shuffle=True,
                                  num_workers=self.num_workers)
        valid_loader = DataLoader(valid,
                                  batch_size=self.hparams['batch_size'],
                                  shuffle=False,
                                  num_workers=self.num_workers)

        # tensorboard object
        writer = SummaryWriter()

        for epoch in range(self.hparams['n_epochs']):

            # train the model
            self.model.train()
            avg_loss = 0.0

            train_preds, train_true = torch.Tensor([]), torch.Tensor([])
            for (X_batch, y_batch) in tqdm(train_loader):
                y_batch = y_batch.float().to(self.device)
                X_batch = X_batch.float().to(self.device)

                self.optimizer.zero_grad()
                # get model predictions
                pred, pred_decoder = self.model(X_batch)

                # process loss_1
                pred = pred.view(-1, pred.shape[-1])
                pred = pred**2
                y_batch = y_batch.view(-1, y_batch.shape[-1])
                train_loss = self.loss(pred, y_batch)

                y_batch = y_batch.float().cpu().detach()
                pred = pred.float().cpu().detach()

                # process loss_2
                pred_decoder = pred_decoder.view(-1, pred_decoder.shape[-1])
                X_batch = X_batch.view(-1, X_batch.shape[-1])
                decoder_train_loss = self.decoder_loss(pred_decoder, X_batch)
                X_batch = X_batch.float().cpu().detach()
                pred_decoder = pred_decoder.float().cpu().detach()

                # calc loss
                avg_loss += train_loss.item() / len(train_loader)

                #sum up multi-head losses
                train_loss = train_loss + decoder_train_loss

                self.scaler.scale(train_loss).backward()  # train_loss.backward()
                # unscale the gradients so the clipping thresholds apply to their true values
                self.scaler.unscale_(self.optimizer)
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
                torch.nn.utils.clip_grad_value_(self.model.parameters(), 0.5)
                self.scaler.step(self.optimizer)  # self.optimizer.step()
                self.scaler.update()

                train_true = torch.cat([train_true, y_batch], 0)
                train_preds = torch.cat([train_preds, pred], 0)

            # calc training metric
            train_preds = train_preds.numpy()
            train_true = train_true.numpy()

            threshold = self.postprocessing.find_opt_thresold(
                train_true, train_preds)
            self.postprocessing.update_threshold(threshold)
            train_preds = self.postprocessing.run(train_preds)
            metric_train = self.metric.compute(labels=train_true,
                                               outputs=train_preds)

            # evaluate the model
            print('Model evaluation...')
            self.model.eval()
            val_preds, val_true = torch.Tensor([]), torch.Tensor([])
            avg_val_loss = 0.0
            with torch.no_grad():
                for X_batch, y_batch in valid_loader:
                    y_batch = y_batch.float().to(self.device)
                    X_batch = X_batch.float().to(self.device)

                    pred, pred_decoder = self.model(X_batch)
                    pred_decoder = pred_decoder.float().cpu().detach()
                    X_batch = X_batch.float().cpu().detach()

                    pred = pred.reshape(-1, pred.shape[-1])
                    pred = pred**2
                    y_batch = y_batch.view(-1, y_batch.shape[-1])

                    avg_val_loss += self.loss(
                        pred, y_batch).item() / len(valid_loader)
                    y_batch = y_batch.float().cpu().detach()
                    pred = pred.float().cpu().detach()

                    val_true = torch.cat([val_true, y_batch], 0)
                    val_preds = torch.cat([val_preds, pred], 0)

            # evaluate metric
            val_preds = val_preds.numpy()
            val_true = val_true.numpy()
            # val_true, val_preds = self.metric.find_opt_thresold(val_true, val_preds)
            val_preds = self.postprocessing.run(val_preds)
            metric_val = self.metric.compute(val_true, val_preds)

            self.scheduler.step(metric_val)  # alternatively: avg_val_loss
            res = self.early_stopping(score=metric_val,
                                      model=self.model,
                                      threshold=threshold)

            # print statistics
            if self.hparams['verbose_train']:
                print(
                    '| Epoch: ',
                    epoch + 1,
                    '| Train_loss: ',
                    avg_loss,
                    '| Val_loss: ',
                    avg_val_loss,
                    '| Metric_train: ',
                    metric_train,
                    '| Metric_val: ',
                    metric_val,
                    '| Current LR: ',
                    self.__get_lr(self.optimizer),
                )

            # # add history to tensorboard
            writer.add_scalars(
                'Loss',
                {
                    'Train_loss': avg_loss,
                    'Val_loss': avg_val_loss
                },
                epoch,
            )

            writer.add_scalars('Metric', {
                'Metric_train': metric_train,
                'Metric_val': metric_val
            }, epoch)

            if res == 2:
                print("Early Stopping")
                print(f'global best model score: {self.early_stopping.best_score}')
                break
            elif res == 1:
                print(f'saved new best model, score: {metric_val}')

        writer.close()

        self.model = self.early_stopping.load_best_weights()
        self.postprocessing.update_threshold(self.early_stopping.threshold)

        return True

    def predict(self, X_test):

        # evaluate the model
        self.model.eval()

        test_loader = torch.utils.data.DataLoader(
            X_test,
            batch_size=self.hparams['batch_size'],
            shuffle=False,
            num_workers=self.num_workers)  # ,collate_fn=train.my_collate

        test_preds = torch.Tensor([])
        test_val = torch.Tensor([])
        print('Start generation of predictions')
        with torch.no_grad():
            for i, (X_batch, y_batch) in enumerate(tqdm(test_loader)):
                X_batch = X_batch.float().to(self.device)

                pred, pred_decoder = self.model(X_batch)
                pred = pred**2

                X_batch = X_batch.float().cpu().detach()

                test_preds = torch.cat([test_preds, pred.cpu().detach()], 0)
                test_val = torch.cat([test_val, y_batch.cpu().detach()], 0)

        return test_val.numpy(), test_preds.numpy()

    def get_heatmap(self, X_test):

        # evaluate the model
        self.model.eval()

        test_loader = torch.utils.data.DataLoader(
            X_test,
            batch_size=self.hparams['batch_size'],
            shuffle=False,
            num_workers=self.num_workers)  # ,collate_fn=train.my_collate

        test_preds = torch.Tensor([])
        with torch.no_grad():
            for i, (X_batch) in enumerate(test_loader):
                X_batch = X_batch.float().to(self.device)

                pred = self.model.activatations(X_batch)
                pred = torch.sigmoid(pred)
                pred = pred**2
                X_batch = X_batch.float().cpu().detach()

                test_preds = torch.cat([test_preds, pred.cpu().detach()], 0)

        return test_preds.numpy()

    def model_save(self, model_path):
        torch.save(self.model.state_dict(), model_path)
        # self.model.module.state_dict(), PATH
        # torch.save(self.model, model_path)
        return True

    def model_load(self, model_path):
        self.model.load_state_dict(
            torch.load(model_path, map_location=self.device))
        return True

    def model_load_old(self, model_path):
        self.model = torch.load(model_path, map_location=self.device)
        return True

    def inference(self, X, y):

        preprocessing = Preprocessing(aug=False)

        X = preprocessing.run(X, y, label_process=False)

        X = X.reshape(1, -1, X.shape[1])

        self.model.eval()
        predictions, pred_decoder = self.model.forward(torch.Tensor(X))
        predictions = predictions**2
        predictions = predictions.detach().numpy()
        print(np.round(predictions, 3))

        return predictions

    ################## Utils #####################

    def __get_lr(self, optimizer):
        for param_group in optimizer.param_groups:
            return param_group['lr']
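
The fit loop above scales the loss with torch.cuda.amp.GradScaler but never enters an autocast region, so the forward pass still runs in full precision. For comparison, a canonical mixed-precision step, with the gradients unscaled before clipping, might look like the sketch below; model, optimizer, scaler, loss_fn and the batch tensors are placeholders, not objects from this listing.

import torch


def amp_train_step(model, optimizer, scaler, loss_fn, X_batch, y_batch, max_norm=1.0):
    """Sketch of a typical torch.cuda.amp training step."""
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():          # forward pass in mixed precision
        pred = model(X_batch)
        loss = loss_fn(pred, y_batch)
    scaler.scale(loss).backward()            # backward on the scaled loss
    scaler.unscale_(optimizer)               # unscale so clipping sees the true gradient norms
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    scaler.step(optimizer)                   # the step is skipped if inf/NaN gradients were found
    scaler.update()
    return loss.item()
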
def train_velonet(args, dataset):

    training_data, training_label, valid_data, valid_label = create_dataset_Relative_Kinematic(
        args, dataset, windows_size)

    valid_data.requires_grad = False
    valid_label.requires_grad = False

    training_data = quick_std(training_data)
    valid_data = quick_std(valid_data)
    #training_label = quick_norm(training_label)
    #valid_label = quick_norm(valid_label)

    early_stopping = EarlyStopping(patience=35, verbose=True)

    device = torch.device(
        'cuda:0' if torch.cuda.is_available() and not args.cpu else
        'cpu')  #device = torch.device('cpu')

    network = get_Relative_Kinematic().to(device)

    print('Number of train samples: {}'.format(training_data.shape[0]))
    print('Number of val samples: {}'.format(valid_data.shape[0]))

    total_params = network.get_num_params()
    print('Total number of parameters: ', total_params)

    optimizer = torch.optim.Adam(network.parameters(), lr)

    # If the validation loss has not improved for 25 epochs, reduce the learning rate to help convergence towards an optimal solution
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=25,
                                                           verbose=True,
                                                           eps=1e-12)

    if load_model:
        dictionary = torch.load(model_path_VeloNet)
        network.load_state_dict(dictionary.get('model_state_dict'))
        optimizer.load_state_dict(dictionary.get('optimizer_state_dict'))

    start_time = time.time()

    avg_train_losses = []
    avg_valid_losses = []
    writer = SummaryWriter()

    for epoch in range(1, epoch_len + 1):

        train_loss, valid_loss = train_loop_Relative_Kinematic(
            args, dataset, network, device, optimizer, scheduler,
            training_data, valid_data, training_label, valid_label, batch_size,
            writer)

        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)

        writer.add_scalars('Train_loss/Validation_loss', {
            'Train_loss': train_loss,
            'Valid_loss': valid_loss,
        }, epoch)

        save = early_stopping(avg_valid_losses[-1], network)

        if save:  #Save the model if the validation loss improved
            torch.save(
                {
                    'model_state_dict': network.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'epoch': epoch_len
                }, model_path_VeloNet)
            print('SAVE')

        if early_stopping.early_stop:  # Otherwise, if the validation loss has not improved for patience = 35 epochs, stop the training
            print('Early stopping')
            break

        print("Epoch number : " + str(epoch) + "/" + str(epoch_len))
        print('\tTrain_Loss: {:.9f}'.format(avg_train_losses[-1]))
        print('\tValid_Loss: {:.9f}'.format(avg_valid_losses[-1]))
        print("Amount of time spent for 1 epoch: {}s\n".format(
            int(time.time() - start_time)))
        start_time = time.time()
    writer.close()
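
quick_std is not shown in this listing; judging from how it is applied to the training and validation tensors above, it presumably standardizes the data, and a helper of that kind could be sketched as follows (the exact behaviour of the original function is an assumption):

import torch


def quick_std(data: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """Sketch: zero-mean / unit-variance standardization over the sample dimension."""
    mean = data.mean(dim=0, keepdim=True)
    std = data.std(dim=0, keepdim=True)
    return (data - mean) / (std + eps)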