Example #1
    def __init__(self, option, model, train_dataset, valid_dataset, test_dataset=None, weight=[[1.0, 1.0]],
                 tasks_num=17):
        # Most important variable
        self.option = option
        self.device = torch.device("cuda:{}".format(option['gpu'][0]) if torch.cuda.is_available() else "cpu")
        self.model = DataParallel(model).to(self.device) if option['parallel'] else model.to(self.device)

        # Setting the train valid and test data loader
        if self.option['parallel']:
            self.train_dataloader = DataListLoader(train_dataset, batch_size=self.option['batch_size'], shuffle=True)
            self.valid_dataloader = DataListLoader(valid_dataset, batch_size=self.option['batch_size'])
            if test_dataset: self.test_dataloader = DataListLoader(test_dataset, batch_size=self.option['batch_size'])
        else:
            self.train_dataloader = DataLoader(train_dataset, batch_size=self.option['batch_size'], shuffle=True)
            self.valid_dataloader = DataLoader(valid_dataset, batch_size=self.option['batch_size'])
            if test_dataset: self.test_dataloader = DataLoader(test_dataset, batch_size=self.option['batch_size'])
        self.save_path = self.option['exp_path']
        # Setting the Adam optimizer with hyper-param
        if option['focalloss']:
            self.log('Using FocalLoss')
            self.criterion = [FocalLoss(alpha=1 / w[0]) for w in weight]  # alpha 0.965
        else:
            self.criterion = [torch.nn.CrossEntropyLoss(torch.Tensor(w).to(self.device), reduction='mean') for w in
                              weight]
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.option['lr'],
                                          weight_decay=option['weight_decay'])
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='min', factor=0.7,
            patience=self.option['lr_scheduler_patience'], min_lr=1e-6
        )

        # other
        self.start = time.time()
        self.tasks_num = tasks_num

        self.records = {'trn_record': [], 'val_record': [], 'val_losses': [],
                        'best_ckpt': None, 'val_roc': [], 'val_prc': []}
        self.log(msgs=['\t{}:{}\n'.format(k, v) for k, v in self.option.items()], show=False)
        self.log('train set num:{}    valid set num:{}    test set num: {}'.format(
            len(train_dataset), len(valid_dataset), len(test_dataset)))
        self.log("total parameters:" + str(sum([p.nelement() for p in self.model.parameters()])))
        self.log(msgs=str(model).split('\n'), show=False)
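The per-task criterion list built above (one weighted CrossEntropyLoss per task) can be exercised in isolation. A minimal runnable sketch, with a made-up two-task weight list standing in for the `weight` argument:

import torch

weight = [[1.0, 1.0], [0.3, 1.7]]  # hypothetical [class-0, class-1] weights, one pair per task
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = [torch.nn.CrossEntropyLoss(torch.Tensor(w).to(device), reduction='mean')
             for w in weight]

logits = torch.randn(8, 2, device=device)          # (batch, n_classes) outputs for one task
labels = torch.randint(0, 2, (8,), device=device)  # integer class labels
loss = criterion[0](logits, labels)                 # weighted loss for task 0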
Example #2
def init_model(model_cls,
               log_dir_base,
               fold_no,
               device_ids=None,
               use_gpu=False,
               dp=False,
               ddp=False,
               tb_dir='runs',
               lr=1e-3,
               weight_decay=1e-2):
    writer = SummaryWriter(log_dir=osp.join(tb_dir, log_dir_base))

    model = model_cls(writer)

    writer.add_text('model_summary', model.__repr__())

    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=lr,
                                  betas=(0.9, 0.999),
                                  eps=1e-08,
                                  weight_decay=weight_decay,
                                  amsgrad=False)
    # scheduler_reduce = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)
    # scheduler = GradualWarmupScheduler(optimizer, multiplier=10, total_epoch=5)
    # scheduler = scheduler_reduce
    # optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
    if dp and use_gpu:
        model = model.cuda() if device_ids is None else model.to(device_ids[0])
        model = DataParallel(model, device_ids=device_ids)
    elif use_gpu:
        model = model.to(device_ids[0])

    device_count = torch.cuda.device_count() if dp else 1
    device_count = len(device_ids) if (device_ids is not None
                                       and dp) else device_count

    return model, optimizer, writer, device_count
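A hedged usage sketch of the helper above. MyGNN, the log directory name and the GPU ids are placeholders; the only real requirement, visible in the body, is that model_cls(writer) accepts a SummaryWriter:

model, optimizer, writer, device_count = init_model(
    MyGNN,                    # hypothetical model class whose constructor takes a SummaryWriter
    log_dir_base='mygnn_cv',  # subdirectory created under tb_dir='runs'
    fold_no=1,
    device_ids=[0, 1],
    use_gpu=True,
    dp=True,                  # wrap the model in torch_geometric.nn.DataParallel
    lr=1e-3,
    weight_decay=1e-2)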
Example #3
def train_RGNN(tr_dataset, te_dataset, n_epochs, batch_size, lr, z_dim, K, dropout, adj_type, learn_edge, lambda1,
               lambda2, domain_adaptation, lambda_dat, label_type, ckpt_save_name=None, ckpt_load=None):
    # log hyper-parameter
    logger.critical('batch_size {}, lr {}, z_dim {}, K {}, dropout {}, adj_type {}, learn_edge {}, lambda1 {},'
                    'lambda2 {}, domain_adaptation {}, lambda_dat {}, label_type {}'
                    .format(batch_size, lr, z_dim, K, dropout, adj_type, learn_edge, lambda1,
                            lambda2, domain_adaptation, lambda_dat, label_type))

    # parameter sanity check
    if label_type not in label_types:
        raise Exception("undefined label_type")
    if adj_type not in adj_types:
        raise Exception("undefined adj_type")

    # construct model
    edge_weight = initial_adjacency_matrix(adj_type)
    model = SymSimGCNNet(n_channels, learn_edge, edge_weight, n_bands, [z_dim], n_classes[label_type],
                         K, dropout, domain_adaptation)
    last_epoch = 0
    if ckpt_load is not None:
        ckpt = torch.load(ckpt_load)
        last_epoch = ckpt["epoch"]  # read from the loaded checkpoint dict, not the path string
        if last_epoch >= n_epochs:
            raise Exception("loaded model has already trained >= n_epochs")
        state_dict = ckpt["state_dict"]
        model.load_state_dict(state_dict)
    # use multiple GPU
    model = DataParallel(model, device_ids=device_ids).to(device)
    logger.info(model)

    # prepare dataloader
    logger.info("tr_dataset: {}".format(tr_dataset))
    logger.info("te_dataset: {}".format(te_dataset))
    logger.info("training start from epoch {}".format(last_epoch))
    tr_loader = DataListLoader(tr_dataset, batch_size, True)

    # prepare optimizer
    param_list1 = []
    param_list2 = []
    for name, param in model.named_parameters():
        if name in ['module.edge_weight', 'module.conv1.lin.bias', 'module.fc.bias']:
            param_list1.append(param)
        else:
            param_list2.append(param)
    optimizer = torch.optim.Adam([
        {'params': param_list1, 'weight_decay': 0},
        {'params': param_list2, 'weight_decay': lambda2}
    ], lr=lr)
    # iterate over all epochs
    eval_acc_list = []
    macro_f1_list = []
    for ep in range(last_epoch + 1, n_epochs + 1):
        model.train()
        loss_all = 0
        reverse_scale = 2 / (1 + math.exp(-10 * ep / n_epochs)) - 1
        if domain_adaptation == 'RevGrad':
            model.module.alpha = reverse_scale

        # iterate over all graphs
        for tr_data_list in tr_loader:
            # output shape (len(tr_data_list), 5 or 1)
            output, domain_output = model(tr_data_list)
            # classification loss
            # y shape (len(tr_data_list), )
            y = torch.cat([data.y for data in tr_data_list]).to(output.device)
            if label_type == "hard":
                loss = F.cross_entropy(output, y)
            elif label_type == "soft":
                loss = - distribution_label(y) * F.log_softmax(output, dim=1)
                loss = torch.mean(torch.sum(loss, dim=1))
            else:
                loss = F.mse_loss(output, y - 2)
            # l1 regularization loss
            if learn_edge:
                loss += lambda1 * torch.sum(torch.abs(model.module.edge_weight))
            # domain adaptation loss
            if domain_adaptation:
                # tr_data.x: [num_graph * n_channels, feature_dim]
                n_nodes = domain_output.size(0)
                # F.cross_entropy expects integer (long) class targets: zeros for the source domain
                loss += lambda_dat * F.cross_entropy(domain_output, torch.zeros(n_nodes, dtype=torch.long).cuda())
                te_indices = torch.randint(0, len(te_dataset), (len(tr_data_list),))
                te_data = te_dataset[te_indices]
                _, te_domain_output = model(te_data)
                loss += lambda_dat * F.cross_entropy(te_domain_output, torch.ones(n_nodes, dtype=torch.long).cuda())

            loss_all += loss.item() * len(tr_data_list)
            # optimize the model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # evaluate the model
        accuracy, macro_f1_score = evaluate_RGNN(model, te_dataset, label_type)
        eval_acc_list.append(accuracy)
        macro_f1_list.append(macro_f1_score)
        train_acc, _ = evaluate_RGNN(model, tr_dataset, label_type)
        logger.info('epoch: {:4d}; loss: {:9.5f}; train acc: {:9.5f}; eval acc: {:9.5f}; '
                    'macro f1: {:9.5f};'
                    .format(ep, loss_all/len(tr_dataset), train_acc, accuracy, macro_f1_score))

    # save model checkpoint
    logger.info(list(model.parameters()))
    logger.info(format_list(model.module.edge_weight.detach().cpu().numpy().flatten()))
    if ckpt_save_name is not None:
        checkpoint = {"epoch": n_epochs, "state_dict": model.state_dict()}
        torch.save(checkpoint, ckpt_dir + '/' + ckpt_save_name)
    return eval_acc_list, macro_f1_list
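The "soft" branch above is a cross-entropy against a full label distribution rather than a one-hot target. A self-contained sketch of just that term, with a uniform distribution standing in for the (not shown) distribution_label(y):

import torch
import torch.nn.functional as F

logits = torch.randn(4, 5)             # (batch, n_classes) model outputs
target_dist = torch.full((4, 5), 0.2)  # stand-in for distribution_label(y); each row sums to 1
loss = torch.mean(torch.sum(-target_dist * F.log_softmax(logits, dim=1), dim=1))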
Example #4
def main(args):
    batch_size = args.batch_size
    model_fname = args.mod_name

    if multi_gpu and batch_size < torch.cuda.device_count():
        exit('Batch size too small')

    # make a folder for the graphs of this model
    Path(args.output_dir).mkdir(exist_ok=True)
    save_dir = osp.join(args.output_dir, model_fname)
    Path(save_dir).mkdir(exist_ok=True)

    # get dataset and split
    gdata = GraphDataset(root=args.input_dir, bb=args.box_num)
    # merge data from separate files into one contiguous array
    bag = []
    for g in gdata:
        bag += g
    random.Random(0).shuffle(bag)
    bag = bag[:args.num_data]
    # temporary patch to use px, py, pz
    for d in bag:
        d.x = d.x[:, :3]
    # 80:10:10 split datasets
    fulllen = len(bag)
    train_len = int(0.8 * fulllen)
    tv_len = int(0.10 * fulllen)
    train_dataset = bag[:train_len]
    valid_dataset = bag[train_len:train_len + tv_len]
    test_dataset = bag[train_len + tv_len:]
    train_samples = len(train_dataset)
    valid_samples = len(valid_dataset)
    test_samples = len(test_dataset)
    if multi_gpu:
        train_loader = DataListLoader(train_dataset,
                                      batch_size=batch_size,
                                      pin_memory=True,
                                      shuffle=True)
        valid_loader = DataListLoader(valid_dataset,
                                      batch_size=batch_size,
                                      pin_memory=True,
                                      shuffle=False)
        test_loader = DataListLoader(test_dataset,
                                     batch_size=batch_size,
                                     pin_memory=True,
                                     shuffle=False)
    else:
        train_loader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  pin_memory=True,
                                  shuffle=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=batch_size,
                                  pin_memory=True,
                                  shuffle=False)
        test_loader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 pin_memory=True,
                                 shuffle=False)

    # specify loss function
    loss_ftn_obj = LossFunction(args.loss,
                                emd_modname=args.emd_model_name,
                                device=device)

    # create model
    input_dim = 3
    big_dim = 32
    hidden_dim = args.lat_dim
    lr = args.lr
    patience = args.patience

    if args.model == 'MetaLayerGAE':
        model = models.GNNAutoEncoder()
    else:
        if args.model[-3:] == 'EMD':
            model = getattr(models,
                            args.model)(input_dim=input_dim,
                                        big_dim=big_dim,
                                        hidden_dim=hidden_dim,
                                        emd_modname=args.emd_model_name)
        else:
            model = getattr(models, args.model)(input_dim=input_dim,
                                                big_dim=big_dim,
                                                hidden_dim=hidden_dim)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=4)

    valid_losses = []
    train_losses = []
    start_epoch = 0
    n_epochs = 200

    # load in model
    modpath = osp.join(save_dir, model_fname + '.best.pth')
    try:
        model.load_state_dict(torch.load(modpath))
        train_losses, valid_losses, start_epoch = torch.load(
            osp.join(save_dir, 'losses.pt'))
        print('Loaded model')
        best_valid_loss = test(model, valid_loader, valid_samples, batch_size,
                               loss_ftn_obj)
        print(f'Saved model valid loss: {best_valid_loss}')
    except Exception:  # no usable saved model; start from scratch
        print('Creating new model')
        best_valid_loss = 9999999
    if multi_gpu:
        model = DataParallel(model)
    model.to(torch.device(device))

    # Training loop
    stale_epochs = 0
    loss = best_valid_loss
    for epoch in range(start_epoch, n_epochs):

        if multi_gpu:
            loss = train_parallel(model, optimizer, train_loader,
                                  train_samples, batch_size, loss_ftn_obj)
            valid_loss = test_parallel(model, valid_loader, valid_samples,
                                       batch_size, loss_ftn_obj)
        else:
            loss = train(model, optimizer, train_loader, train_samples,
                         batch_size, loss_ftn_obj)
            valid_loss = test(model, valid_loader, valid_samples, batch_size,
                              loss_ftn_obj)

        scheduler.step(valid_loss)
        train_losses.append(loss)
        valid_losses.append(valid_loss)
        print('Epoch: {:02d}, Training Loss:   {:.4f}'.format(epoch, loss))
        print('               Validation Loss: {:.4f}'.format(valid_loss))

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            print('New best model saved to:', modpath)
            if multi_gpu:
                torch.save(model.module.state_dict(), modpath)
            else:
                torch.save(model.state_dict(), modpath)
            torch.save((train_losses, valid_losses, epoch + 1),
                       osp.join(save_dir, 'losses.pt'))
            stale_epochs = 0
        else:
            stale_epochs += 1
            print(
                f'Stale epoch: {stale_epochs}\nBest: {best_valid_loss}\nCurr: {valid_loss}'
            )
        if stale_epochs >= patience:
            print('Early stopping after %i stale epochs' % patience)
            break

    # model training done
    train_epochs = list(range(epoch + 1))
    early_stop_epoch = epoch - stale_epochs
    loss_curves(train_epochs, early_stop_epoch, train_losses, valid_losses,
                save_dir)

    # compare input and reconstructions
    # the best checkpoint stores the bare module's state dict, so unwrap DataParallel when loading
    if multi_gpu:
        model.module.load_state_dict(torch.load(modpath))
    else:
        model.load_state_dict(torch.load(modpath))
    input_fts = []
    reco_fts = []
    for t in valid_loader:
        model.eval()
        if isinstance(t, list):
            for d in t:
                input_fts.append(d.x)
        else:
            input_fts.append(t.x)
            t.to(device)
        reco_out = model(t)
        if isinstance(reco_out, tuple):
            reco_out = reco_out[0]
        reco_fts.append(reco_out.cpu().detach())
    input_fts = torch.cat(input_fts)
    reco_fts = torch.cat(reco_fts)
    plot_reco_difference(
        input_fts, reco_fts, model_fname,
        osp.join(save_dir, 'reconstruction_post_train', 'valid'))

    input_fts = []
    reco_fts = []
    for t in test_loader:
        model.eval()
        if isinstance(t, list):
            for d in t:
                input_fts.append(d.x)
        else:
            input_fts.append(t.x)
            t.to(device)
        reco_out = model(t)
        if isinstance(reco_out, tuple):
            reco_out = reco_out[0]
        reco_fts.append(reco_out.cpu().detach())
    input_fts = torch.cat(input_fts)
    reco_fts = torch.cat(reco_fts)
    plot_reco_difference(
        input_fts, reco_fts, model_fname,
        osp.join(save_dir, 'reconstruction_post_train', 'test'))
    print('Completed')
Example #5
def GCN(dataset, params, Epochs, MonteSize, width, lr, savepath):
    Batch_size = int(params[0])

    for Monte_iter in range(MonteSize):

        # Data
        best_loss = float('inf')  # best test loss
        start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        TrainConvergence = []
        TestConvergence = []

        # model
        root = '/data/GraphData/' + dataset
        if dataset == 'Cora':
            model_name = "GCN3"
            datasetroot = Planetoid(root=root, name=dataset).shuffle()
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=100,
                                        shuffle=False)
            model_to_save = './checkpoint/{}-{}-param_{}_{}-Mon_{}-ckpt.pth'.format(
                dataset, model_name, params[0], params[1], Monte_iter)
            if resume and os.path.exists(model_to_save):
                [net, TrainConvergence, TestConvergence,
                 start_epoch] = ResumeModel(model_to_save)
                if start_epoch >= Epochs - 1:
                    continue

            else:
                net = Net(datasetroot, width)

        elif dataset == 'ENZYMES' or dataset == 'MUTAG':
            model_name = "topk_pool_Net"
            root = '/data/GraphData' + dataset
            datasetroot = TUDataset(root, name=dataset)
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=100,
                                        shuffle=False)
            model_to_save = './checkpoint/{}-{}-param_{}_{}-Mon_{}-ckpt.pth'.format(
                dataset, model_name, params[0], params[1], Monte_iter)
            if resume and os.path.exists(model_to_save):
                [net, TrainConvergence, TestConvergence,
                 start_epoch] = ResumeModel(model_to_save)
                if start_epoch >= Epochs - 1:
                    continue

            else:
                net = topk_pool_Net(datasetroot, width)

        elif dataset == 'MNIST':
            datasetroot = MNISTSuperpixels(root='/data/GraphData/' + dataset,
                                           transform=T.Cartesian()).shuffle()
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=100,
                                        shuffle=False)
            model_name = 'SPlineNet'
            model_to_save = './checkpoint/{}-{}-param_{}_{}-Mon_{}-ckpt.pth'.format(
                dataset, model_name, params[0], params[1], Monte_iter)

            if resume and os.path.exists(model_to_save):
                [net, TrainConvergence, TestConvergence,
                 start_epoch] = ResumeModel(model_to_save)
                if start_epoch >= Epochs - 1:
                    continue

            else:
                #net=Net(datasetroot,width)
                net = SPlineNet(datasetroot, width)

        elif dataset == 'CIFAR10':
            if resume and os.path.exists(model_to_save):
                [net, TrainConvergence, TestConvergence,
                 start_epoch] = ResumeModel(model_to_save)
                if start_epoch >= Epochs - 1:
                    continue
            else:
                net = getattr(CIFAR10_resnet, 'Resnet20_CIFAR10')(params[1])
        else:
            raise Exception(
                "The dataset is:{}, it isn't existed.".format(dataset))

        print('Let\'s use', torch.cuda.device_count(), 'GPUs!')
        torch.cuda.is_available()
        net = DataParallel(net)
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        net = net.to(device)

        #cudnn.benchmark = True

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(),
                              lr=lr,
                              momentum=0.9,
                              weight_decay=5e-4)
        for epoch in range(start_epoch, start_epoch + Epochs):
            if epoch < Epochs:
                logging(
                    'Batch size: {},ConCoeff: {},MonteSize:{},epoch:{}'.format(
                        params[0], params[1], Monte_iter, epoch))
                TrainLoss = train(trainloader, net, optimizer, criterion)
                TrainConvergence.append(statistics.mean(TrainLoss))
                TestConvergence.append(
                    statistics.mean(test(testloader, net, criterion)))
            else:
                break
            if TestConvergence[epoch] < best_loss:
                logging('Saving..')
                state = {
                    'net': net.module,
                    'TrainConvergence': TrainConvergence,
                    'TestConvergence': TestConvergence,
                    'epoch': epoch,
                }
                if not os.path.isdir('checkpoint'):
                    os.mkdir('checkpoint')
                torch.save(state, model_to_save)
                best_loss = TestConvergence[epoch]
                if not os.path.exists('./%s' % model_name):
                    os.makedirs('./%s' % model_name)
                torch.save(
                    net.module.state_dict(),
                    './%s/%s_%s_%s_%s_%s_pretrain.pth' %
                    (model_name, dataset, model_name, params[0], params[1],
                     Epochs))
            else:
                pass
            ## save recurrence plots
            if epoch % 20 == 0:
                save_recurrencePlots_file = "../Results/RecurrencePlots/RecurrencePlots_{}_{}_BatchSize{}_ConCoeffi{}_epoch{}.png".format(
                    dataset, model_name, params[0], params[1], epoch)

                save_recurrencePlots(net, save_recurrencePlots_file)

        FileName = "{}-{}-param_{}_{}-monte_{}".format(dataset, model_name,
                                                       params[0], params[1],
                                                       Monte_iter)
        np.save(savepath + 'TrainConvergence-' + FileName, TrainConvergence)
        np.save(savepath + 'TestConvergence-' + FileName, TestConvergence)
        torch.cuda.empty_cache()
        print_nvidia_useage()

    if return_output == True:
        return TestConvergence[-1], net.module.fc.weight
    else:
        pass
Example #6
def apply_dataparallel(model, cfgs):
    return DataParallel(model)
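Since the helper simply wraps the model in torch_geometric.nn.DataParallel (cfgs is ignored here), a minimal usage sketch might look as follows. Net, dataset and cfgs are placeholders; DataParallel consumes the plain Python lists of Data objects produced by DataListLoader:

from torch_geometric.loader import DataListLoader  # older PyG releases: from torch_geometric.data import DataListLoader

model = apply_dataparallel(Net(), cfgs={}).to('cuda')
loader = DataListLoader(dataset, batch_size=32, shuffle=True)
for data_list in loader:    # a list of Data objects, scattered across the visible GPUs
    out = model(data_list)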
Example #7
        self.lin1 = torch.nn.Linear(64, 128)
        self.lin2 = torch.nn.Linear(128, dataset.num_classes)

    def forward(self, data):
        print('Inside Model:  num graphs: {}, device: {}'.format(
            data.num_graphs, data.batch.device))

        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        x = F.elu(self.conv1(x, edge_index, edge_attr))
        x = F.elu(self.conv2(x, edge_index, edge_attr))
        x = global_mean_pool(x, data.batch)
        x = F.elu(self.lin1(x))
        return F.log_softmax(self.lin2(x), dim=1)


model = Net()
print('Let\'s use', torch.cuda.device_count(), 'GPUs!')
model = DataParallel(model)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for data_list in loader:
    optimizer.zero_grad()
    output = model(data_list)
    print('Outside Model: num graphs: {}'.format(output.size(0)))
    y = torch.cat([data.y for data in data_list]).to(output.device)
    loss = F.nll_loss(output, y)
    loss.backward()
    optimizer.step()
Example #8
        id_euc = sphg(pos, 0.6, batch=batch, max_num_neighbors=16)
        x9 = self.conv3(x8, pos, B, N, id_euc)
        x10 = self.lin3(x9.view(B, N, -1))
        x = x10.max(1)[0]  # [B, C]

        return self.fc(x)


# Train and test

model = Net(train_dataset.num_classes)
model = model.to(device)
model.load_state_dict(torch.load('weight.pth',
                                 map_location=f'cuda:{device_list[0]}'),
                      strict=True)
if cuda: model = DataParallel(model, device_ids=device_list)
optimizer = torch.optim.Adam([{
    'params': model.parameters(),
    'initial_lr': base_lr
}],
                             lr=base_lr,
                             weight_decay=1e-4)
# optimizer.load_state_dict(torch.load('geocnn_optimizer.pt', map_location=f'cuda:{device_list[0]}').state_dict())
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                       epoch,
                                                       eta_min=0.00001,
                                                       last_epoch=-1)
criterion = cal_loss


def train(epoch):
Example #9
    parameters = list(model.parameters())

    optimizer = torch.optim.Adam(params=parameters, lr=args.lr)
    total_params = sum(p.numel() for p in parameters)
    print(f'Total number of parameters is {total_params}')
    if args.model == 'DGCNN':
        print(f'SortPooling k is set to {model.k}')
    log_file = os.path.join(args.res_dir, 'log.txt')
    with open(log_file, 'a') as f:
        print(f'Total number of parameters is {total_params}', file=f)
        if args.model == 'DGCNN':
            print(f'SortPooling k is set to {model.k}', file=f)

    start_epoch = 1

    if args.multi_gpu: model = DataParallel(model)
    model = model.to(device)

    if args.continue_from is not None:
        model.load_state_dict(
            torch.load(
                os.path.join(
                    args.res_dir,
                    'model_checkpoint{}.pth'.format(args.continue_from))))
        optimizer.load_state_dict(
            torch.load(
                os.path.join(
                    args.res_dir,
                    'optimizer_checkpoint{}.pth'.format(args.continue_from))))
        start_epoch = args.continue_from + 1
        args.epochs -= args.continue_from
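The resume block above expects model_checkpoint{N}.pth and optimizer_checkpoint{N}.pth files holding plain state dicts under args.res_dir. A hedged sketch of the matching save step, presumably executed at the end of each epoch elsewhere in the script (model, optimizer and epoch come from that training loop):

torch.save(model.state_dict(),
           os.path.join(args.res_dir, 'model_checkpoint{}.pth'.format(epoch)))
torch.save(optimizer.state_dict(),
           os.path.join(args.res_dir, 'optimizer_checkpoint{}.pth'.format(epoch)))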
Example #10
    def __init__(self,
                 option,
                 model,
                 train_dataset=None,
                 valid_dataset=None,
                 test_dataset=None,
                 weight=[[1.0, 1.0]],
                 tasks_num=1):
        self.option = option
        # self.tasks = ["MUV-466","MUV-548","MUV-600","MUV-644","MUV-652","MUV-689","MUV-692","MUV-712","MUV-713",
        #               "MUV-733","MUV-737","MUV-810","MUV-832","MUV-846","MUV-852","MUV-858","MUV-859"]
        self.tasks_num = tasks_num

        self.save_path = self.option['exp_path']

        self.device = torch.device("cuda:{}".format(0) \
                                       if torch.cuda.is_available() and not option['cpu']  else "cpu")
        self.model = DataParallel(model).to(self.device) \
            if option['parallel'] else model.to(self.device)

        #Setting the train valid and test data loader
        if train_dataset and valid_dataset:
            if self.option['parallel']:
                self.train_dataloader = DataListLoader(train_dataset, \
                                                       batch_size=self.option['batch_size'],shuffle=True)
                self.valid_dataloader = DataListLoader(
                    valid_dataset, batch_size=self.option['batch_size'])
                if test_dataset:
                    self.test_dataloader = DataListLoader(
                        test_dataset, batch_size=self.option['batch_size'])
            else:
                self.train_dataloader = DataLoader(train_dataset, \
                                                   batch_size=self.option['batch_size'],shuffle=True,num_workers=4)
                self.valid_dataloader = DataLoader(
                    valid_dataset,
                    batch_size=self.option['batch_size'],
                    num_workers=4)
                if test_dataset:
                    self.test_dataloader = DataLoader(
                        test_dataset,
                        batch_size=self.option['batch_size'],
                        num_workers=4)
        else:
            self.test_dataset = test_dataset
            if self.option['parallel']:
                self.test_dataloader = DataListLoader(
                    test_dataset,
                    batch_size=self.option['batch_size'],
                    num_workers=0)
            else:
                self.test_dataloader = DataLoader(
                    test_dataset,
                    batch_size=self.option['batch_size'],
                    num_workers=4)

        # Setting the Adam optimizer with hyper-param

        if not option['focalloss']:
            self.criterion = [
                torch.nn.CrossEntropyLoss(torch.Tensor(w).to(self.device),
                                          reduction='mean') for w in weight
            ]
        else:
            self.log('Using FocalLoss')
            self.criterion = [FocalLoss(alpha=1 / w[0])
                              for w in weight]  #alpha 0.965
        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.option['lr'],
                                          weight_decay=option['weight_decay'])
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode='min',
            factor=0.7,
            patience=self.option['lr_scheduler_patience'],
            min_lr=1e-6)
        self.start = time.time()
        self.records = {
            'best_epoch': None,
            'val_auc': [],
            'best_val_auc': 0.,
            'best_trn_auc': 0.,
            'best_test_auc': 0.
        }
        self.log(
            msgs=['\t{}:{}\n'.format(k, v) for k, v in self.option.items()],
            show=False)
        if train_dataset:
            self.log(
                'train set num:{}    valid set num:{}    test set num: {}'.
                format(len(train_dataset), len(valid_dataset),
                       len(test_dataset)))
        self.log("total parameters:" +
                 str(sum([p.nelement() for p in self.model.parameters()])))
        self.log(msgs=str(model).split('\n'), show=False)
Example #11
def train_cross_validation(model_cls,
                           dataset,
                           num_clusters,
                           dropout=0.0,
                           lr=1e-4,
                           weight_decay=1e-2,
                           num_epochs=200,
                           n_splits=10,
                           use_gpu=True,
                           dp=False,
                           ddp=True,
                           comment='',
                           tb_service_loc='192.168.192.57:6006',
                           batch_size=1,
                           num_workers=0,
                           pin_memory=False,
                           cuda_device=None,
                           fold_no=None,
                           saved_model_path=None,
                           device_ids=None,
                           patience=50,
                           seed=None,
                           save_model=True,
                           c_reg=0,
                           base_log_dir='runs',
                           base_model_save_dir='saved_models'):
    """
    :param c_reg:
    :param save_model: bool
    :param seed:
    :param patience: for early stopping
    :param device_ids: for ddp
    :param saved_model_path:
    :param fold_no:
    :param ddp: DDP
    :param cuda_device:
    :param pin_memory: DataLoader args https://devblogs.nvidia.com/how-optimize-data-transfers-cuda-cc/
    :param num_workers: DataLoader args
    :param model_cls: pytorch Module cls
    :param dataset: pytorch Dataset cls
    :param dropout:
    :param lr:
    :param weight_decay:
    :param num_epochs:
    :param n_splits: number of kFolds
    :param use_gpu: bool
    :param dp: bool
    :param comment: comment in the logs, to filter runs in tensorboard
    :param tb_service_loc: tensorboard service location
    :param batch_size: Dataset args not DataLoader
    :return:
    """
    saved_args = locals()
    seed = int(time.time() % 1e4 * 1e5) if seed is None else seed
    saved_args['random_seed'] = seed

    torch.manual_seed(seed)
    np.random.seed(seed)
    if use_gpu:
        torch.cuda.manual_seed_all(seed)

    if ddp and not torch.distributed.is_initialized():  # initialize ddp
        dist.init_process_group('nccl',
                                init_method='tcp://localhost:{}'.format(
                                    find_open_port()),
                                world_size=1,
                                rank=0)

    model_name = model_cls.__name__

    if not cuda_device:
        if device_ids and (ddp or dp):
            device = device_ids[0]
        else:
            device = torch.device(
                'cuda' if torch.cuda.is_available() and use_gpu else 'cpu')
    else:
        device = cuda_device

    device_count = torch.cuda.device_count() if dp else 1
    device_count = len(device_ids) if (device_ids is not None and
                                       (dp or ddp)) else device_count
    if device_count > 1:
        print("Let's use", device_count, "GPUs!")

    # batch_size = batch_size * device_count

    log_dir_base = get_model_log_dir(comment, model_name)
    if tb_service_loc is not None:
        print("TensorBoard available at http://{1}/#scalars&regexInput={0}".
              format(log_dir_base, tb_service_loc))
    else:
        print("Please set up TensorBoard")

    criterion = nn.CrossEntropyLoss()

    # get test set
    folds = StratifiedKFold(n_splits=n_splits, shuffle=False)
    train_val_idx, test_idx = list(
        folds.split(np.zeros(len(dataset)), dataset.data.y.numpy()))[0]
    test_dataset = dataset.__indexing__(test_idx)
    train_val_dataset = dataset.__indexing__(train_val_idx)

    print("Training {0} {1} models for cross validation...".format(
        n_splits, model_name))
    # folds, fold = KFold(n_splits=n_splits, shuffle=False, random_state=seed), 0
    folds = StratifiedKFold(n_splits=n_splits, shuffle=False)
    iter = folds.split(np.zeros(len(train_val_dataset)),
                       train_val_dataset.data.y.numpy())
    fold = 0

    for train_idx, val_idx in tqdm_notebook(iter, desc='CV', leave=False):

        fold += 1
        if fold_no is not None:
            if fold != fold_no:
                continue

        writer = SummaryWriter(log_dir=osp.join(base_log_dir, log_dir_base +
                                                str(fold)))
        model_save_dir = osp.join(base_model_save_dir,
                                  log_dir_base + str(fold))

        print("creating dataloader tor fold {}".format(fold))

        model = model_cls(writer,
                          num_clusters=num_clusters,
                          in_dim=dataset.data.x.shape[1],
                          out_dim=int(dataset.data.y.max() + 1),
                          dropout=dropout)

        # My Batch

        train_dataset = train_val_dataset.__indexing__(train_idx)
        val_dataset = train_val_dataset.__indexing__(val_idx)

        train_dataset = dataset_gather(
            train_dataset,
            seed=0,
            n_repeat=1,
            n_splits=int(len(train_dataset) / batch_size) + 1)
        val_dataset = dataset_gather(
            val_dataset,
            seed=0,
            n_repeat=1,
            n_splits=int(len(val_dataset) / batch_size) + 1)

        train_dataloader = DataLoader(train_dataset,
                                      shuffle=True,
                                      batch_size=device_count,
                                      collate_fn=lambda data_list: data_list,
                                      num_workers=num_workers,
                                      pin_memory=pin_memory)
        val_dataloader = DataLoader(val_dataset,
                                    shuffle=False,
                                    batch_size=device_count,
                                    collate_fn=lambda data_list: data_list,
                                    num_workers=num_workers,
                                    pin_memory=pin_memory)

        # if fold == 1 or fold_no is not None:
        print(model)
        writer.add_text('model_summary', model.__repr__())
        writer.add_text('training_args', str(saved_args))

        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=lr,
                                     betas=(0.9, 0.999),
                                     eps=1e-08,
                                     weight_decay=weight_decay,
                                     amsgrad=False)
        # optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
        if ddp:
            model = model.cuda() if device_ids is None else model.to(
                device_ids[0])
            model = nn.parallel.DistributedDataParallel(model,
                                                        device_ids=device_ids)
        elif dp and use_gpu:
            model = model.cuda() if device_ids is None else model.to(
                device_ids[0])
            model = DataParallel(model, device_ids=device_ids)
        elif use_gpu:
            model = model.to(device)

        if saved_model_path is not None:
            model.load_state_dict(torch.load(saved_model_path))

        best_map, patience_counter, best_score = 0.0, 0, -np.inf
        for epoch in tqdm_notebook(range(1, num_epochs + 1),
                                   desc='Epoch',
                                   leave=False):

            for phase in ['train', 'validation']:

                if phase == 'train':
                    model.train()
                    dataloader = train_dataloader
                else:
                    model.eval()
                    dataloader = val_dataloader

                # Logging
                running_total_loss = 0.0
                running_corrects = 0
                running_reg_loss = 0.0
                running_nll_loss = 0.0
                epoch_yhat_0, epoch_yhat_1 = torch.tensor([]), torch.tensor([])
                epoch_label, epoch_predicted = torch.tensor([]), torch.tensor(
                    [])

                for data_list in tqdm_notebook(dataloader,
                                               desc=phase,
                                               leave=False):

                    # TODO: check devices
                    if dp:
                        data_list = to_cuda(data_list,
                                            (device_ids[0] if device_ids
                                             is not None else 'cuda'))

                    y_hat, reg = model(data_list)
                    # y_hat = y_hat.reshape(batch_size, -1)

                    y = torch.tensor([],
                                     dtype=dataset.data.y.dtype,
                                     device=device)
                    for data in data_list:
                        y = torch.cat([y, data.y.view(-1).to(device)])

                    loss = criterion(y_hat, y)
                    reg_loss = -reg
                    total_loss = (loss + reg_loss * c_reg).sum()

                    if phase == 'train':
                        # print(torch.autograd.grad(y_hat.sum(), model.saved_x, retain_graph=True))
                        optimizer.zero_grad()
                        total_loss.backward(retain_graph=True)
                        nn.utils.clip_grad_norm_(model.parameters(), 2.0)
                        optimizer.step()

                    _, predicted = torch.max(y_hat, 1)
                    label = y

                    running_nll_loss += loss.item()
                    running_total_loss += total_loss.item()
                    running_reg_loss += reg.sum().item()
                    running_corrects += (predicted == label).sum().item()

                    epoch_yhat_0 = torch.cat(
                        [epoch_yhat_0, y_hat[:, 0].detach().view(-1).cpu()])
                    epoch_yhat_1 = torch.cat(
                        [epoch_yhat_1, y_hat[:, 1].detach().view(-1).cpu()])
                    epoch_label = torch.cat(
                        [epoch_label,
                         label.detach().cpu().float()])
                    epoch_predicted = torch.cat(
                        [epoch_predicted,
                         predicted.detach().cpu().float()])

                # precision = sklearn.metrics.precision_score(epoch_label, epoch_predicted, average='micro')
                # recall = sklearn.metrics.recall_score(epoch_label, epoch_predicted, average='micro')
                # f1_score = sklearn.metrics.f1_score(epoch_label, epoch_predicted, average='micro')
                accuracy = sklearn.metrics.accuracy_score(
                    epoch_label, epoch_predicted)
                epoch_total_loss = running_total_loss / dataloader.__len__()
                epoch_nll_loss = running_nll_loss / dataloader.__len__()
                epoch_reg_loss = running_reg_loss / dataloader.dataset.__len__(
                )

                writer.add_scalars(
                    'nll_loss', {'{}_nll_loss'.format(phase): epoch_nll_loss},
                    epoch)
                writer.add_scalars('accuracy',
                                   {'{}_accuracy'.format(phase): accuracy},
                                   epoch)
                # writer.add_scalars('{}_APRF'.format(phase),
                #                    {
                #                        'accuracy': accuracy,
                #                        'precision': precision,
                #                        'recall': recall,
                #                        'f1_score': f1_score
                #                    },
                #                    epoch)
                if epoch_reg_loss != 0:
                    writer.add_scalars(
                        'reg_loss'.format(phase),
                        {'{}_reg_loss'.format(phase): epoch_reg_loss}, epoch)
                # writer.add_histogram('hist/{}_yhat_0'.format(phase),
                #                      epoch_yhat_0,
                #                      epoch)
                # writer.add_histogram('hist/{}_yhat_1'.format(phase),
                #                      epoch_yhat_1,
                #                      epoch)

                # Save Model & Early Stopping
                if phase == 'validation':
                    model_save_path = model_save_dir + '-{}-{}-{:.3f}-{:.3f}'.format(
                        model_name, epoch, accuracy, epoch_nll_loss)
                    if accuracy > best_map:
                        best_map = accuracy
                        model_save_path = model_save_path + '-best'

                    score = -epoch_nll_loss
                    if score > best_score:
                        patience_counter = 0
                        best_score = score
                    else:
                        patience_counter += 1

                    # skip 10 epoch
                    # best_score = best_score if epoch > 10 else -np.inf

                    if save_model:
                        for th, pfix in zip(
                            [0.8, 0.75, 0.7, 0.5, 0.0],
                            ['-perfect', '-great', '-good', '-bad', '-miss']):
                            if accuracy >= th:
                                model_save_path += pfix
                                break
                        if epoch > 10:
                            torch.save(model.state_dict(), model_save_path)

                    writer.add_scalars('best_val_accuracy',
                                       {'{}_accuracy'.format(phase): best_map},
                                       epoch)
                    writer.add_scalars(
                        'best_nll_loss',
                        {'{}_nll_loss'.format(phase): -best_score}, epoch)

                    if patience_counter >= patience:
                        print("Stopped at epoch {}".format(epoch))
                        return

    print("Done !")
Example #12
def train_cummunity_detection(model_cls,
                              dataset,
                              dropout=0.0,
                              lr=1e-3,
                              weight_decay=1e-2,
                              num_epochs=200,
                              n_splits=10,
                              use_gpu=True,
                              dp=False,
                              ddp=False,
                              comment='',
                              tb_service_loc='192.168.192.57:6006',
                              batch_size=1,
                              num_workers=0,
                              pin_memory=False,
                              cuda_device=None,
                              ddp_port='23456',
                              fold_no=None,
                              device_ids=None,
                              patience=20,
                              seed=None,
                              save_model=False,
                              supervised=False):
    """
    :param save_model: bool
    :param seed:
    :param patience: for early stopping
    :param device_ids: for ddp
    :param saved_model_path:
    :param fold_no:
    :param ddp_port:
    :param ddp: DDP
    :param cuda_device:
    :param pin_memory: DataLoader args https://devblogs.nvidia.com/how-optimize-data-transfers-cuda-cc/
    :param num_workers: DataLoader args
    :param model_cls: pytorch Module cls
    :param dataset: pytorch Dataset cls
    :param dropout:
    :param lr:
    :param weight_decay:
    :param num_epochs:
    :param n_splits: number of kFolds
    :param use_gpu: bool
    :param dp: bool
    :param comment: comment in the logs, to filter runs in tensorboard
    :param tb_service_loc: tensorboard service location
    :param batch_size: Dataset args not DataLoader
    :return:
    """

    saved_args = locals()
    seed = int(time.time() % 1e4 * 1e5) if seed is None else seed
    saved_args['random_seed'] = seed

    torch.manual_seed(seed)
    np.random.seed(seed)
    if use_gpu:
        torch.cuda.manual_seed_all(seed)

    if ddp and not torch.distributed.is_initialized():  # initialize ddp
        dist.init_process_group(
            'nccl',
            init_method='tcp://localhost:{}'.format(ddp_port),
            world_size=1,
            rank=0)

    model_name = model_cls.__name__

    if not cuda_device:
        if device_ids and (ddp or dp):
            device = device_ids[0]
        else:
            device = torch.device(
                'cuda' if torch.cuda.is_available() and use_gpu else 'cpu')
    else:
        device = cuda_device

    device_count = torch.cuda.device_count() if dp else 1
    device_count = len(device_ids) if (device_ids is not None and
                                       (dp or ddp)) else device_count
    if device_count > 1:
        print("Let's use", device_count, "GPUs!")

    # batch_size = batch_size * device_count

    log_dir_base = get_model_log_dir(comment, model_name)
    if tb_service_loc is not None:
        print("TensorBoard available at http://{1}/#scalars&regexInput={0}".
              format(log_dir_base, tb_service_loc))
    else:
        print("Please set up TensorBoard")

    print("Training {0} {1} models for cross validation...".format(
        n_splits, model_name))
    folds, fold = KFold(n_splits=n_splits, shuffle=False, random_state=seed), 0
    print(dataset.__len__())

    for train_idx, test_idx in tqdm_notebook(folds.split(
            list(range(dataset.__len__())), list(range(dataset.__len__()))),
                                             desc='models',
                                             leave=False):
        fold += 1
        if fold_no is not None:
            if fold != fold_no:
                continue

        writer = SummaryWriter(log_dir=osp.join('runs', log_dir_base +
                                                str(fold)))
        model_save_dir = osp.join('saved_models', log_dir_base + str(fold))

        print("creating dataloader tor fold {}".format(fold))

        model = model_cls(writer, dropout=dropout)

        # My Batch
        train_dataset = dataset.__indexing__(train_idx)
        test_dataset = dataset.__indexing__(test_idx)

        train_dataset = dataset_gather(train_dataset,
                                       n_repeat=1,
                                       n_splits=int(
                                           len(train_dataset) / batch_size))

        train_dataloader = DataLoader(train_dataset,
                                      shuffle=True,
                                      batch_size=device_count,
                                      collate_fn=lambda data_list: data_list,
                                      num_workers=num_workers,
                                      pin_memory=pin_memory)
        test_dataloader = DataLoader(test_dataset,
                                     shuffle=False,
                                     batch_size=device_count,
                                     collate_fn=lambda data_list: data_list,
                                     num_workers=num_workers,
                                     pin_memory=pin_memory)

        print(model)
        writer.add_text('model_summary', model.__repr__())
        writer.add_text('training_args', str(saved_args))

        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=lr,
                                     betas=(0.9, 0.999),
                                     eps=1e-08,
                                     weight_decay=weight_decay,
                                     amsgrad=False)
        # optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
        if ddp:
            model = model.cuda() if device_ids is None else model.to(
                device_ids[0])
            model = nn.parallel.DistributedDataParallel(model,
                                                        device_ids=device_ids)
        elif dp and use_gpu:
            model = model.cuda() if device_ids is None else model.to(
                device_ids[0])
            model = DataParallel(model, device_ids=device_ids)
        elif use_gpu:
            model = model.to(device)

        for epoch in tqdm_notebook(range(1, num_epochs + 1),
                                   desc='Epoch',
                                   leave=False):

            for phase in ['train', 'validation']:

                if phase == 'train':
                    model.train()
                    dataloader = train_dataloader
                else:
                    model.eval()
                    dataloader = test_dataloader

                # Logging
                running_total_loss = 0.0
                running_reg_loss = 0.0
                running_overlap = 0.0

                for data_list in tqdm_notebook(dataloader,
                                               desc=phase,
                                               leave=False):

                    # TODO: check devices
                    if dp:
                        data_list = to_cuda(data_list,
                                            (device_ids[0] if device_ids
                                             is not None else 'cuda'))

                    y_hat, reg = model(data_list)

                    y = torch.tensor([],
                                     dtype=dataset.data.y.dtype,
                                     device=device)
                    for data in data_list:
                        y = torch.cat([y, data.y.view(-1).to(device)])

                    if supervised:
                        loss = permutation_invariant_loss(y_hat, y)
                        # criterion = nn.NLLLoss()
                        # loss = criterion(y_hat, y)
                    else:
                        loss = -reg
                    total_loss = loss

                    if phase == 'train':
                        # print(torch.autograd.grad(y_hat.sum(), model.saved_x, retain_graph=True))
                        optimizer.zero_grad()
                        total_loss.backward(retain_graph=True)
                        nn.utils.clip_grad_norm_(model.parameters(), 2.0)
                        optimizer.step()

                    _, predicted = torch.max(y_hat, 1)
                    label = y

                    if supervised:
                        overlap_score = normalized_overlap(
                            label.int().cpu().numpy(),
                            predicted.int().cpu().numpy(), 0.25)
                        # overlap_score = overlap(label.int().cpu().numpy(), predicted.int().cpu().numpy())
                        running_overlap += overlap_score
                        print(reg, overlap_score, loss)

                    running_total_loss += total_loss.item()
                    running_reg_loss += reg.sum().item()

                epoch_total_loss = running_total_loss / dataloader.__len__()
                epoch_reg_loss = running_reg_loss / dataloader.dataset.__len__(
                )
                if supervised:
                    epoch_overlap = running_overlap / dataloader.__len__()
                    writer.add_scalars(
                        'overlap'.format(phase),
                        {'{}_overlap'.format(phase): epoch_overlap}, epoch)

                writer.add_scalars(
                    'reg_loss'.format(phase),
                    {'{}_reg_loss'.format(phase): epoch_reg_loss}, epoch)

    print("Done !")
Example #13
def TrainingNet(dataset,modelName,params,num_pre_epochs,num_epochs,NumCutoff,optimizerName,LinkPredictionMethod,MonteSize,savepath):
    Batch_size=params[0]
    VectorPairs=params[4]
    StartTopoCoeffi=params[5]
    WeightCorrectionCoeffi=params[6]
    interval=params[7]
    root='/git/data/GraphData/'+dataset
    TestAccs=[]
    
    for Monte_iter in range(MonteSize):
        # Data
        NewNetworkSizeAdjust=[]
        WeightsDynamicsEvolution=[]
        trainValRatio=[0.2,0.4]
        # model 
        if dataset=='Cora' or dataset =='Citeseer' or dataset =='Pubmed':
            datasetroot= Planetoid(root=root, name=dataset,transform =T.NormalizeFeatures()).shuffle()    
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)

            """            train_mask, val_mask,test_mask=DataSampler(trainValRatio,datasetroot.data.num_nodes)
            DataMask={}
            DataMask['train_mask']=train_mask
            DataMask['val_mask']=val_mask
            DataMask['test_mask']=test_mask
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)"""
            num_features=datasetroot.num_features
            num_classes=datasetroot.num_classes
            criterion = nn.CrossEntropyLoss()


        elif dataset =="CoraFull":
            datasetroot = CoraFull(root=root,transform =T.NormalizeFeatures()).shuffle()
            """train_mask, val_mask,test_mask=DataSampler(trainValRatio,datasetroot.data.num_nodes)
            DataMask={}
            DataMask['train_mask']=train_mask
            DataMask['val_mask']=val_mask
            DataMask['test_mask']=test_mask"""
            criterion = nn.CrossEntropyLoss()
            num_features=datasetroot.num_features
            num_classes=datasetroot.num_classes
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=False)


        elif dataset=='ENZYMES' or dataset=='MUTAG':
            datasetroot=TUDataset(root,name=dataset,use_node_attr=True)
            trainloader = DataLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            num_features=datasetroot.num_features
            num_classes=datasetroot.num_classes     
            
            
        elif dataset =="PPI":
            train_dataset = PPI(root, split='train')
            val_dataset = PPI(root, split='val')
            test_dataset = PPI(root, split='test')
            trainloader = DataListLoader(train_dataset, batch_size=Batch_size, shuffle=True)
            valloader = DataListLoader(val_dataset, batch_size=100, shuffle=False)
            testloader = DataListLoader(test_dataset, batch_size=100, shuffle=False)
            num_classes=train_dataset.num_classes
            num_features=train_dataset.num_features
            criterion = torch.nn.BCEWithLogitsLoss()

        elif dataset =="Reddit":
            datasetroot=Reddit(root)   
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=2, shuffle=False)

        elif dataset=="Amazon":
            datasetroot=Amazon(root, "Photo", transform=None, pre_transform=None)
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100, shuffle=False)

        elif dataset=='MNIST':
            datasetroot = MNISTSuperpixels(root=root, transform=T.Cartesian())
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100, shuffle=False)

        elif dataset=='CIFAR10':
            pass
        else:
            raise Exception("Input wrong dataset!!")
        
        width=ContractionLayerCoefficients(num_features,*params[1:3])
        net =ChooseModel(modelName,num_features,num_classes,width)    
        FileName="{}-{}-param_{}_{}_{}_{}-monte_{}".format(dataset,modelName,interval,WeightCorrectionCoeffi,StartTopoCoeffi,VectorPairs,Monte_iter)
        print('Let\'s use', torch.cuda.device_count(), 'GPUs!')
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        criterion = criterion.to(device)
        net = DataParallel(net)
        net = net.to(device)
        optimizer = getattr(optim,optimizerName)(net.parameters(), lr=params[3], momentum=0.9, weight_decay=5e-4)


        model_to_save='./checkpoint/{}-{}-param_{}_{}_{}_{}-ckpt.pth'.format(dataset,modelName,params[0],params[1],params[5],params[4])
        if resume=="True" and os.path.exists(model_to_save):
            [net,optimizer,TrainConvergence,TestConvergence,Acc]=ResumeModel(net,optimizer,model_to_save)
            start_epoch=len(TrainConvergence)
        else:
            start_epoch = 0  # start from epoch 0 or last checkpoint epoch         

    
        # cudnn.benchmark = True
        logging('dataset:{}, Batch size: {}, Number of layers:{} ConCoeff: {}, LR:{}, MonteSize:{}'.format(
            dataset, params[0], params[1], params[2], params[3], Monte_iter))
        mark="{}{}Convergence/DiagElement-{}".format(savepath,dataset,FileName)
        markweights="{}{}Convergence/WeightChanges-{}".format(savepath,dataset,FileName)
                     
        PreTrainConvergence, PreTestConvergence, PreAcc = TrainPart(
            start_epoch, num_pre_epochs, num_classes, trainloader, net, optimizer, criterion,
            NumCutoff, LinkPredictionMethod, VectorPairs, WeightCorrectionCoeffi, StartTopoCoeffi,
            mark, markweights, model_to_save, False)
        print('dataset: {}, model name:{}, epoch:{},Pre-train error:{}; Pre-test error:{}; test acc:{}'.format(dataset,modelName,num_pre_epochs,PreTrainConvergence[-1],PreTestConvergence[-1],PreAcc))

        NewNetworksize=RetainNetworkSize(net,params[2])
        OptimizedNet=ChooseModel(modelName,num_features,num_classes,NewNetworksize[0:-1])
        NewNetworksize.insert(0,num_features)
        NewNetworkSizeAdjust.append(NewNetworksize[0:-1])
        print(NewNetworkSizeAdjust)

        #OptimizedNet.apply(init_weights)

        OptimizedNet = DataParallel(OptimizedNet)
        OptimizedNet = OptimizedNet.to(device)
        cudnn.benchmark = True
        # Begin Pre training
        if optimizerName =="SGD":
            optimizerNew = getattr(optim,optimizerName)(OptimizedNet.parameters(), lr=params[3], momentum=0.9, weight_decay=5e-4)
        elif optimizerName =="Adam":
            optimizerNew = getattr(optim,optimizerName)(OptimizedNet.parameters(), lr=params[3], betas=(0.9, 0.999), eps=1e-08, weight_decay=5e-4, amsgrad=False)
        TrainConvergence, TestConvergence, TestAcc = TrainPart(
            start_epoch, num_epochs, datasetroot.num_classes, trainloader, OptimizedNet, optimizerNew, criterion,
            NumCutoff, LinkPredictionMethod, VectorPairs, WeightCorrectionCoeffi, StartTopoCoeffi,
            mark, markweights, model_to_save, True)
        np.save("{}/{}Convergence/AlgebraicConectivityTrainConvergence-{}".format(savepath,dataset,FileName),TrainConvergence)
        np.save("{}/{}Convergence/AlgebraicConectivityTestConvergence-{}".format(savepath,dataset,FileName),TestConvergence)

        #np.save("{}/{}Convergence/NewNetworkSizeAdjust-{}".format(savepath,dataset,FileName),NewNetworkSizeAdjust)

        #torch.cuda.empty_cache()
        print('dataset: {}, model name:{}, resized network size: {}, epoch:{}, train error: {}, test error: {}, test acc:{}\n'.format(
            dataset, modelName, NewNetworksize[0:-1], num_epochs, TrainConvergence[-1], TestConvergence[-1], TestAcc))
        TestAccs.append(TestAcc)
        np.save("{}/{}Convergence/AlgebraicConectivityMeanTestAccs-{}".format(savepath, dataset, FileName), TestAccs)
        print_nvidia_useage()
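# --- Added sketch (not part of the original example) ---
# TrainingNet above unpacks its positional `params` list by index. A hypothetical
# call could look like the following; the layout is inferred from the body
# (params[0]=Batch_size, params[1:3]=width coefficients, params[3]=lr,
# params[4]=VectorPairs, params[5]=StartTopoCoeffi, params[6]=WeightCorrectionCoeffi,
# params[7]=interval) and every concrete value, including the LinkPredictionMethod
# string, is a placeholder rather than something from the source:
params = [64, 3, 0.5, 0.01, 2, 0.1, 0.9, 5]
TrainingNet('Cora', 'GCN', params, num_pre_epochs=10, num_epochs=200,
            NumCutoff=0.1, optimizerName='SGD', LinkPredictionMethod='CN',
            MonteSize=1, savepath='./Results/')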
Beispiel #14
0
            y = torch.cat([data.y for data in data]).to(output.device)
            loss = F.nll_loss(output, y)
            epoch_test_loss += loss.detach().item()
            pred = output.max(dim=1)[1]
            correct += pred.eq(y).sum().item()
    return epoch_test_loss / len(test_dataset), correct / len(test_dataset)


t0 = time.time()

num_features = train_dataset.num_features
n_classes = train_dataset.num_classes
#model = GCN_PYG(num_features, 96, n_classes, 0)
model = GAT_PYG(num_features, n_classes, 32, 8, 0.5)
print('Let\'s use', torch.cuda.device_count(), 'GPUs!')
model = DataParallel(model, [1])
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay = 0)
scheduler =  torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                             factor=0.5,
                                                             patience=25,
                                                             verbose=True)

dur = []
for epoch in range(1, 201):
    t1 = time.time()

    train_loss, train_acc, optimizer= train_pyg('gcn', model, train_loader, device, optimizer)

    dur.append(time.time() - t1)
Beispiel #15
0
def test_model(model_path,
               dataset_param, model_param, test_dataset_param,
               batch_size=32, layer_num=None,
               metric_name_list=['relative_loss', 'mse_loss', 'accuracy',
                                 'post_relative_loss', 'post_mse_loss', 'post_accuracy',
                                 'layer_num',]):
    print(model_path)
    print(dataset_param, model_param, test_dataset_param)

    model_filename = osp.basename(model_path)
    new_log_dir = osp.dirname(model_path)
    log_dir_base = osp.basename(new_log_dir)
    log_dir = osp.dirname(osp.dirname(new_log_dir))
    performance_dir = osp.join(log_dir, 'performance')
    if not osp.exists(performance_dir):
        os.mkdir(performance_dir)
    performance_dir = dataset_param2path(performance_dir, test_dataset_param)
    new_performance_dir = osp.join(performance_dir, log_dir_base)
    if not osp.exists(new_performance_dir):
        os.mkdir(new_performance_dir)

    batch_size = batch_size

    parallel_flag = False
    if torch.cuda.is_available():
        device = torch.device('cuda')
        # parallel_flag = torch.cuda.device_count() > 1
    else:
        device = torch.device('cpu')

    cur_model_path = os.path.join(new_log_dir, model_filename)
    cur_performance_dir = osp.join(new_performance_dir, model_filename)
    if not osp.exists(cur_performance_dir):
        os.mkdir(cur_performance_dir)
    if not all([osp.exists(osp.join(cur_performance_dir, ('raw_%s'%metric_name)+('' if layer_num is None else '_'+str(layer_num))+'.csv'))
                for metric_name in metric_name_list]):
        with torch.no_grad():
            test_dataset = param2dataset(test_dataset_param, train_flag=False)
            data_loader_fn = DataListLoader if parallel_flag else DataLoader
            test_data_loader = data_loader_fn(test_dataset,  batch_size)

            net = param2model(test_dataset, model_param)
            net = net.to(device)
            checkpoint = torch.load(cur_model_path, map_location=device)
            net.load_state_dict(checkpoint['model_state_dict'])
            if layer_num is not None:
                try:
                    net.gnn_module.layer_num = layer_num
                except AttributeError:
                    for i in range(len(net.gnn_module_list)):
                        net.gnn_module_list[i].layer_num = layer_num
            if parallel_flag:
                net = DataParallel(net)

            test_metric_list, y_list = evaluate(net, test_data_loader, device,
                                                parallel_flag=parallel_flag,
                                                # post_processing_flag=False,
                                                metric_name_list=metric_name_list)

            for metric_name, metric_list in test_metric_list.items():
                record = np.array([y_list, metric_list])
                np.savetxt(osp.join(cur_performance_dir, ('raw_%s'%metric_name)+('' if layer_num is None else '_'+str(layer_num))+'.csv'), record)

            del test_dataset, test_data_loader
Beispiel #16
0
    # gamma: original image ratio
    for sigma, gamma in tqdm(
        product(
            (0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 1, 2, 5, 10),
            (0, 0.1, 0.3, 0.5, 0.75, 0.9, 0.99),
        )
    ):
        print(
            colorama.Fore.MAGENTA
            + "Testing config - sigma: %.3E, gamma: %.3E" % (sigma, gamma)
        )

        model = NaiveBilateralFilter(fin=6, sigma=sigma, gamma=gamma, k=32)
        if parallel:
            model = DataParallel(model, device_ids=gpu_ids, output_device=gpu_id).to(
                device
            )
        else:
            model = model.to(device)
        model.eval()

        total_mse, total_psnr, orig_psnr = evaluate(model, train_loader, 0)
        records[sigma][gamma] = (total_mse, total_psnr)

        if total_psnr > max_psnr:
            best_sigma, best_gamma = sigma, gamma
            max_psnr, min_mse = total_psnr, total_mse

    print(
        colorama.Fore.GREEN
        + "Max PSNR: %.3f, min MSE: %.3f, ORIG-MSE: %.3f@ sigma: %.3f, gamma: %.3f"
Beispiel #17
0
def test_data_parallel():
    module = DataParallel(None)
    data_list = [Data(x=torch.randn(x, 1)) for x in [2, 3, 10, 4]]
    batches = module.scatter(data_list, device_ids=[0, 1, 0, 1])
    assert len(batches) == 3
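    # Note (added): DataParallel.scatter appears to split the list by cumulative
    # node count rather than assigning one graph per listed device, so four graphs
    # with sizes [2, 3, 10, 4] over device_ids=[0, 1, 0, 1] can collapse into
    # three batches, which is what the assertion above checks.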
    DataLoader(datasets[2][test_size:test_size + train_size],
               batch_size=graphsPerBatch)
]

#print('Can we use GPU? ',torch.cuda.is_available())
# Select which GPUs we can see
#os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#print(device.current_device())
myGCN = multiViewGCN(2, 4, device)

# Use multiple GPUs if we can
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    myGCN = DataParallel(myGCN)

optimizer = torch.optim.Adam(myGCN.parameters(),
                             lr=0.0005)  #, weight_decay=5e-4)

nEpochs = 20


def train():
    myGCN.train()
    loss_all = 0
    loss_func = torch.nn.CrossEntropyLoss()

    for data0, data1, data2 in zip(train_loader[0], train_loader[1],
                                   train_loader[2]):
        data0 = data0.to(device)
                                                          3)).view(-1, 3)

        x1 = self.conv1(pos, batch)
        x2 = self.conv2(x1, batch)
        x3 = self.conv3(x2, batch)
        x4 = self.lin1(x3)
        x5 = global_max_pool(x4, batch)
        x6 = x5.repeat([1, 2048]).view(-1, 1024)
        x7 = torch.cat([x2, x3, x6], dim=1)
        out = self.mlp(x7)
        return F.log_softmax(out, dim=1)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net(train_dataset.num_classes, k=30)
model = DataParallel(model).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.8)


def train():
    model.train()

    total_loss = correct_nodes = total_nodes = 0
    for i, data_list in enumerate(train_loader):
        optimizer.zero_grad()
        out = model(data_list)
        y = torch.cat([data.y for data in data_list]).to(device)
        loss = F.nll_loss(out, y)
        loss.backward()
        optimizer.step()
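# --- Added sketch (not part of the original excerpt) ---
# A matching evaluation loop under the same DataParallel setup; `test_loader` is
# assumed to be a DataListLoader over a held-out split and is not defined above.
@torch.no_grad()
def test(loader):
    model.eval()
    correct_nodes = total_nodes = 0
    for data_list in loader:
        out = model(data_list)
        y = torch.cat([data.y for data in data_list]).to(out.device)
        correct_nodes += out.max(dim=1)[1].eq(y).sum().item()
        total_nodes += y.numel()
    return correct_nodes / total_nodes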
Beispiel #20
0
def train_cross_validation(model_cls,
                           dataset,
                           dropout=0.0,
                           lr=1e-3,
                           weight_decay=1e-2,
                           num_epochs=200,
                           n_splits=10,
                           use_gpu=True,
                           dp=False,
                           ddp=False,
                           comment='',
                           tb_service_loc='192.168.192.57:6007',
                           batch_size=1,
                           num_workers=0,
                           pin_memory=False,
                           cuda_device=None,
                           tb_dir='runs',
                           model_save_dir='saved_models',
                           res_save_dir='res',
                           fold_no=None,
                           saved_model_path=None,
                           device_ids=None,
                           patience=20,
                           seed=None,
                           fold_seed=None,
                           save_model=False,
                           is_reg=True,
                           live_loss=True,
                           domain_cls=True,
                           final_cls=True):
    """
    :type fold_seed: int
    :param live_loss: bool
    :param is_reg: bool
    :param save_model: bool
    :param seed:
    :param patience: for early stopping
    :param device_ids: for ddp
    :param saved_model_path:
    :param fold_no: int
    :param ddp_port: str
    :param ddp: DDP
    :param cuda_device: list of int
    :param pin_memory: bool, DataLoader args
    :param num_workers: int, DataLoader args
    :param model_cls: pytorch Module cls
    :param dataset: instance
    :param dropout: float
    :param lr: float
    :param weight_decay:
    :param num_epochs:
    :param n_splits: number of kFolds
    :param use_gpu: bool
    :param dp: bool
    :param comment: comment in the logs, to filter runs in tensorboard
    :param tb_service_loc: tensorboard service location
    :param batch_size: Dataset args not DataLoader
    :return:
    """
    saved_args = locals()
    seed = int(time.time() % 1e4 * 1e5) if seed is None else seed
    saved_args['random_seed'] = seed

    torch.manual_seed(seed)
    np.random.seed(seed)
    if use_gpu:
        torch.cuda.manual_seed_all(seed)
        # torch.backends.cudnn.deterministic = True
        # torch.backends.cudnn.benchmark = False

    model_name = model_cls.__name__

    if not cuda_device:
        if device_ids and dp:
            device = device_ids[0]
        else:
            device = torch.device(
                'cuda' if torch.cuda.is_available() and use_gpu else 'cpu')
    else:
        device = cuda_device

    device_count = torch.cuda.device_count() if dp else 1
    device_count = len(device_ids) if (device_ids is not None
                                       and dp) else device_count

    batch_size = batch_size * device_count

    # TensorBoard
    log_dir_base = get_model_log_dir(comment, model_name)
    if tb_service_loc is not None:
        print("TensorBoard available at http://{1}/#scalars&regexInput={0}".
              format(log_dir_base, tb_service_loc))
    else:
        print("Please set up TensorBoard")

    # model
    criterion = nn.NLLLoss()

    print("Training {0} {1} models for cross validation...".format(
        n_splits, model_name))
    # 1
    # folds, fold = KFold(n_splits=n_splits, shuffle=False, random_state=seed), 0
    # 2
    # folds = GroupKFold(n_splits=n_splits)
    # iter = folds.split(np.zeros(len(dataset)), groups=dataset.data.site_id)
    # 4
    # folds = StratifiedKFold(n_splits=n_splits, random_state=fold_seed, shuffle=True if fold_seed else False)
    # iter = folds.split(np.zeros(len(dataset)), dataset.data.y.numpy(), groups=dataset.data.subject_id)
    # 5
    fold = 0
    iter = multi_site_cv_split(dataset.data.y,
                               dataset.data.site_id,
                               dataset.data.subject_id,
                               n_splits,
                               random_state=fold_seed,
                               shuffle=True if fold_seed else False)

    for train_idx, val_idx in tqdm_notebook(iter, desc='CV', leave=False):
        fold += 1
        liveloss = PlotLosses() if live_loss else None

        # for a specific fold
        if fold_no is not None:
            if fold != fold_no:
                continue

        writer = SummaryWriter(log_dir=osp.join('runs', log_dir_base +
                                                str(fold)))
        model_save_dir = osp.join('saved_models', log_dir_base + str(fold))

        print("creating dataloader tor fold {}".format(fold))

        train_dataset, val_dataset = norm_train_val(dataset, train_idx,
                                                    val_idx)

        model = model_cls(writer)

        train_dataloader = DataLoader(train_dataset,
                                      shuffle=True,
                                      batch_size=batch_size,
                                      collate_fn=lambda data_list: data_list,
                                      num_workers=num_workers,
                                      pin_memory=pin_memory)
        val_dataloader = DataLoader(val_dataset,
                                    shuffle=False,
                                    batch_size=batch_size,
                                    collate_fn=lambda data_list: data_list,
                                    num_workers=num_workers,
                                    pin_memory=pin_memory)

        if fold == 1 or fold_no is not None:
            print(model)
            writer.add_text('model_summary', model.__repr__())
            writer.add_text('training_args', str(saved_args))

        optimizer = torch.optim.AdamW(model.parameters(),
                                      lr=lr,
                                      betas=(0.9, 0.999),
                                      eps=1e-08,
                                      weight_decay=weight_decay,
                                      amsgrad=False)
        # scheduler_reduce = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)
        scheduler = GradualWarmupScheduler(optimizer,
                                           multiplier=10,
                                           total_epoch=5)
        # scheduler = scheduler_reduce
        # optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
        if dp and use_gpu:
            model = model.cuda() if device_ids is None else model.to(
                device_ids[0])
            model = DataParallel(model, device_ids=device_ids)
        elif use_gpu:
            model = model.to(device)

        if saved_model_path is not None:
            model.load_state_dict(torch.load(saved_model_path))

        best_map, patience_counter, best_score = 0.0, 0, np.inf
        for epoch in tqdm_notebook(range(1, num_epochs + 1),
                                   desc='Epoch',
                                   leave=False):
            logs = {}

            # scheduler.step(epoch=epoch, metrics=best_score)

            for phase in ['train', 'validation']:

                if phase == 'train':
                    model.train()
                    dataloader = train_dataloader
                else:
                    model.eval()
                    dataloader = val_dataloader

                # Logging
                running_total_loss = 0.0
                running_corrects = 0
                running_reg_loss = 0.0
                running_nll_loss = 0.0
                epoch_yhat_0, epoch_yhat_1 = torch.tensor([]), torch.tensor([])
                epoch_label, epoch_predicted = torch.tensor([]), torch.tensor(
                    [])

                logging_hist = True if phase == 'train' else False  # once per epoch
                for data_list in tqdm_notebook(dataloader,
                                               desc=phase,
                                               leave=False):

                    # TODO: check devices
                    if dp:
                        data_list = to_cuda(data_list,
                                            (device_ids[0] if device_ids
                                             is not None else 'cuda'))

                    y_hat, domain_yhat, reg = model(data_list)

                    y = torch.tensor([],
                                     dtype=dataset.data.y.dtype,
                                     device=device)
                    domain_y = torch.tensor([],
                                            dtype=dataset.data.site_id.dtype,
                                            device=device)
                    for data in data_list:
                        y = torch.cat([y, data.y.view(-1).to(device)])
                        domain_y = torch.cat(
                            [domain_y,
                             data.site_id.view(-1).to(device)])

                    loss = criterion(y_hat, y)
                    domain_loss = criterion(domain_yhat, domain_y)
                    # domain_loss = -1e-7 * domain_loss
                    # print(domain_loss.item())
                    if domain_cls:
                        total_loss = domain_loss
                        _, predicted = torch.max(domain_yhat, 1)
                        label = domain_y
                    if final_cls:
                        total_loss = loss
                        _, predicted = torch.max(y_hat, 1)
                        label = y
                    if domain_cls and final_cls:
                        total_loss = (loss + domain_loss).sum()
                        _, predicted = torch.max(y_hat, 1)
                        label = y

                    if is_reg:
                        total_loss += reg.sum()

                    if phase == 'train':
                        # print(torch.autograd.grad(y_hat.sum(), model.saved_x, retain_graph=True))
                        optimizer.zero_grad()
                        total_loss.backward()
                        nn.utils.clip_grad_norm_(model.parameters(), 2.0)
                        optimizer.step()

                    running_nll_loss += loss.item()
                    running_total_loss += total_loss.item()
                    running_reg_loss += reg.sum().item()
                    running_corrects += (predicted == label).sum().item()

                    epoch_yhat_0 = torch.cat(
                        [epoch_yhat_0, y_hat[:, 0].detach().view(-1).cpu()])
                    epoch_yhat_1 = torch.cat(
                        [epoch_yhat_1, y_hat[:, 1].detach().view(-1).cpu()])
                    epoch_label = torch.cat(
                        [epoch_label,
                         label.detach().float().view(-1).cpu()])
                    epoch_predicted = torch.cat([
                        epoch_predicted,
                        predicted.detach().float().view(-1).cpu()
                    ])

                # precision = sklearn.metrics.precision_score(epoch_label, epoch_predicted, average='micro')
                # recall = sklearn.metrics.recall_score(epoch_label, epoch_predicted, average='micro')
                # f1_score = sklearn.metrics.f1_score(epoch_label, epoch_predicted, average='micro')
                accuracy = sklearn.metrics.accuracy_score(
                    epoch_label, epoch_predicted)
                epoch_total_loss = running_total_loss / dataloader.__len__()
                epoch_nll_loss = running_nll_loss / dataloader.__len__()
                epoch_reg_loss = running_reg_loss / dataloader.__len__()

                # print('epoch {} {}_nll_loss: {}'.format(epoch, phase, epoch_nll_loss))
                writer.add_scalars(
                    'nll_loss', {'{}_nll_loss'.format(phase): epoch_nll_loss},
                    epoch)
                writer.add_scalars('accuracy',
                                   {'{}_accuracy'.format(phase): accuracy},
                                   epoch)
                # writer.add_scalars('{}_APRF'.format(phase),
                #                    {
                #                        'accuracy': accuracy,
                #                        'precision': precision,
                #                        'recall': recall,
                #                        'f1_score': f1_score
                #                    },
                #                    epoch)
                if epoch_reg_loss != 0:
                    writer.add_scalars(
                        'reg_loss',
                        {'{}_reg_loss'.format(phase): epoch_reg_loss}, epoch)
                # print(epoch_reg_loss)
                # writer.add_histogram('hist/{}_yhat_0'.format(phase),
                #                      epoch_yhat_0,
                #                      epoch)
                # writer.add_histogram('hist/{}_yhat_1'.format(phase),
                #                      epoch_yhat_1,
                #                      epoch)

                # Save Model & Early Stopping
                if phase == 'validation':
                    model_save_path = model_save_dir + '-{}-{}-{:.3f}-{:.3f}'.format(
                        model_name, epoch, accuracy, epoch_nll_loss)
                    # best score
                    if accuracy > best_map:
                        best_map = accuracy
                        model_save_path = model_save_path + '-best'

                    score = epoch_nll_loss
                    if score < best_score:
                        patience_counter = 0
                        best_score = score
                    else:
                        patience_counter += 1

                    # skip first 10 epoch
                    # best_score = best_score if epoch > 10 else -np.inf

                    if save_model:
                        for th, pfix in zip(
                            [0.8, 0.75, 0.7, 0.5, 0.0],
                            ['-perfect', '-great', '-good', '-bad', '-miss']):
                            if accuracy >= th:
                                model_save_path += pfix
                                break

                        torch.save(model.state_dict(), model_save_path)

                    writer.add_scalars('best_val_accuracy',
                                       {'{}_accuracy'.format(phase): best_map},
                                       epoch)
                    writer.add_scalars(
                        'best_nll_loss',
                        {'{}_nll_loss'.format(phase): best_score}, epoch)

                    writer.add_scalars('learning_rate', {
                        'learning_rate':
                        scheduler.optimizer.param_groups[0]['lr']
                    }, epoch)

                    if patience_counter >= patience:
                        print("Stopped at epoch {}".format(epoch))
                        return

                if live_loss:
                    prefix = ''
                    if phase == 'validation':
                        prefix = 'val_'

                    logs[prefix + 'log loss'] = epoch_nll_loss
                    logs[prefix + 'accuracy'] = accuracy
            if live_loss:
                liveloss.update(logs)
                liveloss.draw()

    print("Done !")
Beispiel #21
0
    def __init__(self,
                 option,
                 model,
                 train_dataset,
                 valid_dataset,
                 test_dataset=None):
        self.option = option
        self.device = torch.device("cuda:{}".format(option['cuda_devices'][0]) \
                                       if torch.cuda.is_available() else "cpu")
        self.model = DataParallel(model, device_ids=self.option['cuda_devices']).to(self.device) \
            if option['parallel'] else model.to(self.device)

        # Setting the train valid and test data loader
        if self.option['parallel']:
            self.train_dataloader = DataListLoader(train_dataset, \
                                                   batch_size=self.option['train_batch'])
            self.valid_dataloader = DataListLoader(valid_dataset,
                                                   batch_size=64)
            if test_dataset:
                self.test_dataloader = DataListLoader(test_dataset,
                                                      batch_size=64)
        else:
            self.train_dataloader = DataLoader(train_dataset, \
                                               batch_size=self.option['train_batch'])
            self.valid_dataloader = DataLoader(valid_dataset, batch_size=64)
            if test_dataset:
                self.test_dataloader = DataLoader(test_dataset, batch_size=64)

        # Setting the Adam optimizer with hyper-param
        self.criterion = torch.nn.L1Loss()
        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.option['lr'])
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode='min',
            factor=0.7,
            patience=self.option['lr_scheduler_patience'],
            min_lr=0.0000001)

        # other
        self.start = time.time()
        self.save_id = ''.join(
            random.sample('zyxwvutsrqponmlkjihgfedcba1234567890', 4))
        self.abs_file_dir = os.path.dirname(os.path.abspath(__file__))
        self.ckpt_save_dir = os.path.join(
            self.abs_file_dir, 'ckpt',
            'ckpts_task{}_{}'.format(self.option['task'], self.save_id))
        self.log_save_path = os.path.join(
            self.abs_file_dir, 'log',
            'log_task{}_{}.txt'.format(self.option['task'], self.save_id))
        self.record_save_path = os.path.join(
            self.abs_file_dir, 'record',
            'record_task{}_{}.csv'.format(self.option['task'], self.save_id))
        os.system('mkdir -p log record {}'.format(self.ckpt_save_dir))
        self.records = {
            'trn_record': [],
            'val_record': [],
            'val_losses': [],
            'best_ckpt': None
        }
        self.log(
            msgs=['\t{}:{}\n'.format(k, v) for k, v in self.option.items()])
        self.log('save id: {}'.format(self.save_id))
        self.log('train set num:{} valid set num:{} test set num: {}'.format(
            len(train_dataset), len(valid_dataset), len(test_dataset)))
        self.log("Total Parameters:" +
                 str(sum([p.nelement() for p in self.model.parameters()])))
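# --- Added sketch (not part of the original example) ---
# The constructor above only reads a handful of keys from `option`; a minimal,
# hypothetical configuration could look like this (the class name Trainer and
# all values are placeholders, not taken from the source):
option = {
    'cuda_devices': [0, 1],
    'parallel': True,
    'train_batch': 32,
    'lr': 1e-3,
    'lr_scheduler_patience': 10,
    'task': 1,
}
trainer = Trainer(option, model, train_dataset, valid_dataset, test_dataset)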
Beispiel #22
0
def model_training(data_list_train, data_list_test, epochs, acc_epoch, acc_epoch2, save_model_epochs, validation_epoch, batchsize, logfilename, load_checkpoint= None):
        
    #logging
    logging.basicConfig(level=logging.DEBUG, filename='./logfiles/'+logfilename, filemode="w+",
                        format="%(message)s")
    trainloader = DataListLoader(data_list_train, batch_size=batchsize, shuffle=True)
    testloader = DataListLoader(data_list_test, batch_size=batchsize, shuffle=True)
    device = torch.device('cuda')
    complete_net = completeNet()
    complete_net = DataParallel(complete_net)
    complete_net = complete_net.to(device)
    
    #train parameters
    weights = [10, 1]
    optimizer = torch.optim.Adam(complete_net.parameters(), lr=0.001, weight_decay=0.001)

    #resume training
    initial_epoch=1
    if load_checkpoint!=None:
        checkpoint = torch.load(load_checkpoint)
        complete_net.load_state_dict(checkpoint['model_state_dict'], strict=False)
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        initial_epoch = checkpoint['epoch']+1
        loss = checkpoint['loss']
    
    complete_net.train()

    for epoch in range(initial_epoch, epochs+1):
        epoch_total=0
        epoch_total_ones= 0
        epoch_total_zeros= 0
        epoch_correct=0
        epoch_correct_ones= 0
        epoch_correct_zeros= 0
        running_loss= 0
        batches_num=0         
        for batch in trainloader:
            batch_total=0
            batch_total_ones= 0
            batch_total_zeros= 0
            batch_correct= 0
            batch_correct_ones= 0
            batch_correct_zeros= 0
            batches_num+=1
            # Forward-Backpropagation
            output, output2, ground_truth, ground_truth2, det_num, tracklet_num= complete_net(batch)
            optimizer.zero_grad()
            loss = weighted_binary_cross_entropy(output, ground_truth, weights)
            loss.backward()
            optimizer.step()
            ##Accuracy 
            if epoch%acc_epoch==0 and epoch!=0:
                # Hungarian method, clean up
                cleaned_output= hungarian(output2, ground_truth2, det_num, tracklet_num)
                batch_total += cleaned_output.size(0)
                ones= torch.tensor([1 for x in cleaned_output]).to(device)
                zeros = torch.tensor([0 for x in cleaned_output]).to(device)
                batch_total_ones += (cleaned_output == ones).sum().item()
                batch_total_zeros += (cleaned_output == zeros).sum().item()
                batch_correct += (cleaned_output == ground_truth2).sum().item()
                temp1 = (cleaned_output == ground_truth2)
                temp2 = (cleaned_output == ones)
                batch_correct_ones += (temp1 & temp2).sum().item()
                temp3 = (cleaned_output == zeros)
                batch_correct_zeros += (temp1 & temp3).sum().item()
                epoch_total += batch_total
                epoch_total_ones += batch_total_ones
                epoch_total_zeros += batch_total_zeros
                epoch_correct += batch_correct
                epoch_correct_ones += batch_correct_ones
                epoch_correct_zeros += batch_correct_zeros
            if loss.item() != loss.item():  # NaN check: NaN is the only value not equal to itself
                print("Error")
                break
            if batch_total_ones != 0 and batch_total_zeros != 0 and epoch%acc_epoch==0 and epoch!=0:
                print('Epoch: [%d] | Batch: [%d] | Training_Loss: %.3f | Total_Accuracy: %.3f | Ones_Accuracy: %.3f | Zeros_Accuracy: %.3f |' %
                      (epoch, batches_num, loss.item(), 100 * batch_correct / batch_total, 100 * batch_correct_ones / batch_total_ones,
                       100 * batch_correct_zeros / batch_total_zeros))
                logging.info('Epoch: [%d] | Batch: [%d] | Training_Loss: %.3f | Total_Accuracy: %.3f | Ones_Accuracy: %.3f | Zeros_Accuracy: %.3f |' %
                      (epoch, batches_num, loss.item(), 100 * batch_correct / batch_total, 100 * batch_correct_ones / batch_total_ones,
                       100 * batch_correct_zeros / batch_total_zeros))
            else:
                print('Epoch: [%d] | Batch: [%d] | Training_Loss: %.3f |' %
                        (epoch, batches_num, loss.item()))
                logging.info('Epoch: [%d] | Batch: [%d] | Training_Loss: %.3f |' %
                        (epoch, batches_num, loss.item()))
            running_loss += loss.item()
        if loss.item() != loss.item():  # NaN check
            print("Error")
            break
        if epoch_total_ones!=0 and epoch_total_zeros!=0 and epoch%acc_epoch==0 and epoch!=0:
            print('Epoch: [%d] | Training_Loss: %.3f | Total_Accuracy: %.3f | Ones_Accuracy: %.3f | Zeros_Accuracy: %.3f |' %
                      (epoch, running_loss / batches_num, 100 * epoch_correct / epoch_total, 100 * \
                          epoch_correct_ones / epoch_total_ones, 100 * epoch_correct_zeros / epoch_total_zeros))
            logging.info('Epoch: [%d] | Training_Loss: %.3f | Total_Accuracy: %.3f | Ones_Accuracy: %.3f | Zeros_Accuracy: %.3f |' %
                      (epoch, running_loss / batches_num, 100 * epoch_correct / epoch_total, 100 * \
                          epoch_correct_ones / epoch_total_ones, 100 * epoch_correct_zeros / epoch_total_zeros))
        else:
            print('Epoch: [%d] | Training_Loss: %.3f |' %
                        (epoch, running_loss / batches_num))
            logging.info('Epoch: [%d] | Training_Loss: %.3f |' %
                        (epoch, running_loss / batches_num))
        # save model
        if epoch%save_model_epochs==0 and epoch!=0:
            torch.save({ 
                        'epoch': epoch,
                        'model_state_dict': complete_net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': running_loss,
                        }, './models/epoch_'+str(epoch)+'.pth')

        #validation
        if epoch%validation_epoch==0 and epoch!=0:
            with torch.no_grad():
                epoch_total=0
                epoch_total_ones= 0
                epoch_total_zeros= 0
                epoch_correct=0
                epoch_correct_ones= 0
                epoch_correct_zeros= 0
                running_loss= 0
                batches_num=0
                for batch in testloader:
                    batch_total=0
                    batch_total_ones= 0
                    batch_total_zeros= 0
                    batch_correct= 0
                    batch_correct_ones= 0
                    batch_correct_zeros= 0
                    batches_num+=1
                    output, output2, ground_truth, ground_truth2, det_num, tracklet_num = complete_net(batch)
                    loss = weighted_binary_cross_entropy(output, ground_truth, weights)
                    running_loss += loss.item()
                    ##Accuracy 
                    if epoch%acc_epoch2==0 and epoch!=0:
                        # Hungarian method, clean up
                        cleaned_output= hungarian(output2, ground_truth2, det_num, tracklet_num)
                        batch_total += cleaned_output.size(0)
                        ones= torch.tensor([1 for x in cleaned_output]).to(device)
                        zeros = torch.tensor([0 for x in cleaned_output]).to(device)
                        batch_total_ones += (cleaned_output == ones).sum().item()
                        batch_total_zeros += (cleaned_output == zeros).sum().item()
                        batch_correct += (cleaned_output == ground_truth2).sum().item()
                        temp1 = (cleaned_output == ground_truth2)
                        temp2 = (cleaned_output == ones)
                        batch_correct_ones += (temp1 & temp2).sum().item()
                        temp3 = (cleaned_output == zeros)
                        batch_correct_zeros += (temp1 & temp3).sum().item()
                        epoch_total += batch_total
                        epoch_total_ones += batch_total_ones
                        epoch_total_zeros += batch_total_zeros
                        epoch_correct += batch_correct
                        epoch_correct_ones += batch_correct_ones
                        epoch_correct_zeros += batch_correct_zeros
                if epoch_total_ones!=0 and epoch_total_zeros!=0 and epoch%acc_epoch2==0 and epoch!=0:
                    print('Epoch: [%d] | Validation_Loss: %.3f | Total_Accuracy: %.3f | Ones_Accuracy: %.3f | Zeros_Accuracy: %.3f |' %
                                (epoch, running_loss / batches_num, 100 * epoch_correct / epoch_total, 100 * \
                                    epoch_correct_ones / epoch_total_ones, 100 * epoch_correct_zeros / epoch_total_zeros))
                    logging.info('Epoch: [%d] | Validation_Loss: %.3f | Total_Accuracy: %.3f | Ones_Accuracy: %.3f | Zeros_Accuracy: %.3f |' %
                                (epoch, running_loss / batches_num, 100 * epoch_correct / epoch_total, 100 * \
                                    epoch_correct_ones / epoch_total_ones, 100 * epoch_correct_zeros / epoch_total_zeros))
                else:
                    print('Epoch: [%d] | Validation_Loss: %.3f |' %
                                (epoch, running_loss / batches_num))
                    logging.info('Epoch: [%d] | Validation_Loss: %.3f |' %
                                (epoch, running_loss / batches_num))
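# --- Added sketch (not part of the original example) ---
# A hypothetical call to model_training above; data_list_train / data_list_test
# are assumed to be lists of torch_geometric Data objects and every number is a
# placeholder:
model_training(data_list_train, data_list_test,
               epochs=100, acc_epoch=5, acc_epoch2=5,
               save_model_epochs=10, validation_epoch=5,
               batchsize=8, logfilename='example_run.log')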
def GCN(dataset, params, num_pre_epochs, num_epochs, MonteSize, PruningTimes,
        width, lr, savepath):
    Batch_size = int(params[0])
    for Monte_iter in range(MonteSize):
        # Data
        best_loss = float('inf')  # best test loss
        start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        TrainConvergence = []
        TestConvergence = []

        # model
        root = '/git/data/GraphData/' + dataset

        if dataset == 'Cora':
            model_name = "PruningGCN"
            datasetroot = Planetoid(root=root, name=dataset).shuffle()
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=100,
                                        shuffle=False)
            model_to_save = './checkpoint/{}-{}-param_{}_{}_{}_{}-Monte_{}-ckpt.pth'.format(
                dataset, model_name, params[0], params[1], params[2],
                params[3], Monte_iter)
            if Monte_iter == 0:
                if resume == True and os.path.exists(model_to_save):
                    [
                        OptimizedNet, NewNetworksize, TrainConvergence,
                        TestConvergence, start_epoch
                    ] = ResumeModel(model_to_save)
                    if start_epoch >= num_epochs - 1:
                        continue
                else:
                    net = Net(datasetroot, width)
                    #net.apply(weight_reset)

        elif dataset == 'ENZYMES' or dataset == 'MUTAG':
            model_name = "topk_pool_Net"
            datasetroot = TUDataset(root, name=dataset)
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=100,
                                        shuffle=False)
            model_to_save = './checkpoint/{}-{}-param_{}_{}_{}_{}-ckpt.pth'.format(
                dataset, model_name, params[0], params[1], params[2],
                params[3])
            if Monte_iter == 0:
                if resume == True and os.path.exists(model_to_save):
                    [
                        OptimizedNet, NewNetworksize, TrainConvergence,
                        TestConvergence, start_epoch
                    ] = ResumeModel(model_to_save)
                    if start_epoch >= num_epochs - 1:
                        continue

                else:
                    net = topk_pool_Net(datasetroot, width)

        elif dataset == 'MNIST':
            datasetroot = MNISTSuperpixels(root=root,
                                           transform=T.Cartesian()).shuffle()
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=100,
                                        shuffle=False)
            model_name = 'SPlineNet'
            model_to_save = './checkpoint/{}-{}-param_{}_{}_{}_{}-Mon_{}-ckpt.pth'.format(
                dataset, model_name, params[0], params[1], params[2],
                params[3], Monte_iter)
            if resume == "True" and os.path.exists(model_to_save):
                [net, TrainConvergence, TestConvergence,
                 start_epoch] = ResumeModel(model_to_save)
                if start_epoch >= num_epochs - 1:
                    continue

            else:
                #net=Net(datasetroot,width)
                net = SPlineNet(datasetroot, width)

        elif dataset == 'CIFAR10':
            if resume == "True" and os.path.exists(model_to_save):
                [net, TrainConvergence, TestConvergence,
                 start_epoch] = ResumeModel(model_to_save)
                if start_epoch >= num_epochs - 1:
                    continue
            else:
                net = getattr(CIFAR10_resnet, 'Resnet20_CIFAR10')(params[1])
        else:
            raise Exception(
                "The dataset is:{}, it isn't existed.".format(dataset))

        if Monte_iter == 0 and start_epoch == 0:
            print('Let\'s use', torch.cuda.device_count(), 'GPUs!')
            net = DataParallel(net)
            device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu')
            net = net.to(device)
            #cudnn.benchmark = True
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.SGD(net.parameters(),
                                  lr=lr,
                                  momentum=0.9,
                                  weight_decay=5e-4)
            logging(
                'Batch size: {}, Number of layers:{} ConCoeff: {}, CutoffCoffi:{}, MonteSize:{}'
                .format(params[0], params[1], params[2], params[3],
                        Monte_iter))
            for epoch in range(num_pre_epochs):
                PreTrainLoss = train(trainloader, net, optimizer, criterion)
                print('\nEpoch: {}, Average pre-train loss: {:.4f} \n'.format(
                    epoch, PreTrainLoss[0]))
                NewNetworksize = RetainNetworkSize(net, params[2])
            del net

        #NewNetworksize=width

        for pruningIter in range(PruningTimes):
            if pruningIter > 0:
                [
                    OptimizedNet, NewNetworksize, TrainConvergence,
                    TestConvergence, start_epoch
                ] = ResumeModel(model_to_save)
            elif dataset == 'Cora' and start_epoch == 0:
                OptimizedNet = Net(datasetroot, NewNetworksize[0:-1])
            elif dataset == 'ENZYMES' and start_epoch == 0:
                NewNetworkSizeAdjust = NewNetworksize[0:-1]
                NewNetworkSizeAdjust[0] = width[0] - 1
                OptimizedNet = topk_pool_Net(datasetroot, NewNetworkSizeAdjust)

            OptimizedNet = DataParallel(OptimizedNet)
            device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu')
            OptimizedNet = OptimizedNet.to(device)
            cudnn.benchmark = True
            criterionNew = nn.CrossEntropyLoss()
            optimizerNew = optim.SGD(OptimizedNet.parameters(),
                                     lr=lr,
                                     momentum=0.9,
                                     weight_decay=5e-4)
            criterion = nn.CrossEntropyLoss()

            for epoch in range(start_epoch, num_epochs):
                TrainLoss = train(trainloader, OptimizedNet, optimizerNew,
                                  criterionNew)
                print('\n Epoch: {}, Average train loss: {:.4f} \n'.format(
                    epoch, TrainLoss[0]))
                TrainConvergence.append(statistics.mean(TrainLoss))
                NewNetworksize = RetainNetworkSize(OptimizedNet, params[2])
                TestConvergence.append(
                    statistics.mean(test(testloader, OptimizedNet, criterion)))

                # save model
                if TestConvergence[epoch] < best_loss:
                    logging('Saving..')
                    state = {
                        'net': OptimizedNet.module,
                        'TrainConvergence': TrainConvergence,
                        'TestConvergence': TestConvergence,
                        'epoch': num_epochs,
                        'NewNetworksize': NewNetworksize[0:-1],
                    }
                    if not os.path.isdir('checkpoint'):
                        os.mkdir('checkpoint')
                    torch.save(state, model_to_save)
                    best_loss = TestConvergence[epoch]

                ## save recurrence plots
            """if epoch%20==0:
                save_recurrencePlots_file="../Results/RecurrencePlots/RecurrencePlots_{}_{}_BatchSize{}    \_ConCoeffi{}_epoch{}.png".format(dataset, model_name,params[0],params[1],epoch)

            save_recurrencePlots(net,save_recurrencePlots_file)"""

        FileName = "{}-{}-param_{}_{}_{}_{}-monte_{}".format(
            dataset, model_name, params[0], params[1], params[2], params[3],
            Monte_iter)
        np.save(savepath + 'TrainConvergence-' + FileName, TrainConvergence)
        #np.save(savepath+'TestConvergence-'+FileName,TestConvergence)
        #torch.cuda.empty_cache()
        print_nvidia_useage()

    if return_output == True:
        return TestConvergence[-1], net.module.fc.weight
    else:
        pass