Example #1
  def train(self):
    logger.debug('starting training')

    train_dataset = eval(self.dataset_conf.loader_name)(self.config, self.graphs_train, tag='train')
    
    #Get start and end tokens

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.batch_size,
        shuffle=self.train_conf.shuffle,
        num_workers=self.train_conf.num_workers,
        drop_last=False
        )

    model = eval(self.model_conf.model_name)(
      self.config,
      train_dataset.n_letters,
      train_dataset.seq_len
      )

    #move to gpu and parallelize
    if self.use_gpu:
      model = data_parallel.DataParallel(model, device_ids=self.gpus).to(self.device)

    model_params = filter(lambda p: p.requires_grad, model.parameters())

    #Setup optimizer
    if self.train_conf.optimizer == 'SGD':
      optimizer = optim.SGD(
          model_params,
          lr=self.train_conf.lr,
          momentum=self.train_conf.momentum,
          weight_decay=self.train_conf.wd
          )      
    elif self.train_conf.optimizer == 'Adam':
      optimizer = optim.Adam(
        model_params, 
        lr=self.train_conf.lr, 
        weight_decay=self.train_conf.wd
        )
    else:
      raise ValueError("Non-supported optimizer!")

    # lr_scheduler = optim.lr_scheduler.MultiStepLR(
    #     optimizer,
    #     milestones=self.train_conf.lr_decay_epoch,
    #     gamma=self.train_conf.lr_decay)

    # reset gradient
    # for i, p in enumerate(model.parameters()):
    #     logger.info("{}: {}".format(i, p))
    # print("-"*80)
    #criterion = nn.NLLLoss()
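    # nn.CrossEntropyLoss combines LogSoftmax and NLLLoss, so the NLLLoss
    # variant above is redundant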
    criterion = nn.CrossEntropyLoss()

    # resume training
    resume_epoch = 0
    if self.train_conf.is_resume:
      resume_epoch = self.train_conf.resume_epoch
      model_file = os.path.join(self.train_conf.resume_dir,
                                self.train_conf.resume_model)
      obj = load_model(
          model.module if self.use_gpu else model,
          model_file,
          self.device,
          optimizer=optimizer
          )
      
      if self.use_gpu:
        model.module = obj['model']
      else:
        model = obj['model']

      optimizer = obj['optimizer']
      scheduler = obj['scheduler']

     
    results = defaultdict(list)
    
    for epoch in range(resume_epoch, self.train_conf.max_epoch):
      model.train()

      if epoch == 0:
        logger.debug("Length of train loader: {}".format(len(train_loader)))

      avg_train_loss = .0
      iter_count = 0
      for inp, target, ext in train_loader:
        
        # zero gradients on the raw model (unwrap DataParallel if needed)
        (model.module if self.use_gpu else model).zero_grad()
        optimizer.zero_grad()

        iter_count += 1
        loss = .0
         
        input_tensor = inp.pin_memory().to(self.device, non_blocking=True)
        target_tensor = target.pin_memory().to(self.device, non_blocking=True)
        ext_tensor = ext.pin_memory().to(self.device, non_blocking=True)
        # one initial hidden state per sequence, concatenated along the batch dim
        base_model = model.module if self.use_gpu else model
        hidden = torch.cat(
            [base_model.initHidden().pin_memory().to(self.device, non_blocking=True)
             for _ in range(input_tensor.size(0))],
            dim=1)

        output, hidden = model(ext_tensor, input_tensor, hidden)

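        # sum the per-example cross-entropy losses over the batch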
        for batch in range(output.size(0)):
          l = criterion(output[batch], target_tensor[batch])
          loss += l
        avg_train_loss += float(loss.item()) / output.size(0)

        loss.backward()
        optimizer.step()
        #lr_scheduler.step()

        if iter_count % self.train_conf.display_iter == 0 and iter_count > 1:
          avg_train_loss /= self.train_conf.display_iter
          results['train_loss'] += [avg_train_loss]
          results['train_step'] += [iter_count]

          logger.info("Loss @ epoch {:04d} iteration {:08d} = {}".format(
              epoch + 1, iter_count, avg_train_loss))
          # reset the running average for the next display window
          avg_train_loss = .0

          
      # log one random sample from the final batch of the epoch
      choice = random.choice(range(output.size(0)))
      file_type = self.file_ext[ext_tensor[choice].squeeze().detach().item()]
      target_char = self.tochar(target_tensor[choice])
      predict_char = self.tochar(torch.argmax(output[choice], dim=1))
      logger.info("Epoch {} Iter {} | Sample Start ----------------------".format(epoch, iter_count))
      logger.info("File Type: {}".format(file_type))
      logger.info("Predict: {}".format(''.join(predict_char)))
      logger.info("Target : {}".format(''.join(target_char)))
      logger.info("--------------------------------------------------------")

      # snapshot model
      if epoch % self.train_conf.snapshot_epoch == 0:
        logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
        
        snapshot(model.module, optimizer, self.config, epoch + 1)
        

    pickle.dump(results, open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
    
    return 1
Example #2
    def train(self):
        # create data loader
        train_dataset = Citation(self.dataset_conf.path,
                                 feat_dim_pca=self.model_conf.feat_dim,
                                 dataset_name=self.dataset_conf.name,
                                 split='train',
                                 train_ratio=self.dataset_conf.train_ratio,
                                 use_rand_split=self.dataset_conf.rand_split,
                                 seed=self.config.seed)
        val_dataset = Citation(self.dataset_conf.path,
                               feat_dim_pca=self.model_conf.feat_dim,
                               dataset_name=self.dataset_conf.name,
                               split='val',
                               train_ratio=self.dataset_conf.train_ratio,
                               use_rand_split=self.dataset_conf.rand_split,
                               seed=self.config.seed)
        train_loader = DataLoader(train_dataset,
                                  batch_size=self.train_conf.batch_size,
                                  shuffle=self.train_conf.shuffle,
                                  num_workers=self.train_conf.num_workers,
                                  drop_last=False)
        val_loader = DataLoader(val_dataset,
                                batch_size=self.train_conf.batch_size,
                                shuffle=False,
                                num_workers=self.train_conf.num_workers,
                                drop_last=False)

        # create models
        model = eval(self.model_conf.name)(self.config)

        # create optimizer
        params = model.parameters()
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_steps,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        # resume training
        if self.train_conf.is_resume:
            load_model(model,
                       self.train_conf.resume_model,
                       optimizer=optimizer)

        if self.use_gpu:
            model = nn.DataParallel(model, device_ids=self.gpus).cuda()

        # Training Loop
        iter_count = 0
        best_val_acc = .0
        results = defaultdict(list)
        for epoch in range(self.train_conf.max_epoch):
            # validation
            if (epoch + 1) % self.train_conf.valid_epoch == 0 or epoch == 0:
                model.eval()
                val_loss = []
                total, correct = .0, .0
                for node_feat, node_label, edge, mask in val_loader:
                    if self.use_gpu:
                        node_feat, node_label, edge, mask = (
                            node_feat.cuda(), node_label.cuda(),
                            edge.cuda(), mask.cuda())

                    node_feat, node_label, edge, mask = (
                        node_feat.float(), node_label.long(),
                        edge.long(), mask.byte())

                    node_logit, node_label, _, curr_loss, _ = model(
                        edge, node_feat, target=node_label, mask=mask)
                    val_loss += [float(curr_loss.data.cpu().numpy())]
                    _, predicted = torch.max(node_logit.data, 1)
                    total += node_label.size(0)
                    correct += predicted.eq(
                        node_label.data).cpu().numpy().sum()

                val_loss = float(np.mean(val_loss))
                val_acc = 100.0 * correct / total

                # save best model
                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    snapshot(model,
                             optimizer,
                             self.config,
                             epoch + 1,
                             tag='best')

                logger.info("Avg. Validation Loss = {}".format(val_loss))
                logger.info("Validation Accuracy = {}".format(val_acc))
                logger.info("Current Best Validation Accuracy = {}".format(
                    best_val_acc))
                results['val_loss'] += [val_loss]
                results['val_acc'] += [val_acc]
                model.train()

            # training
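            # note: stepping the scheduler before the optimizer follows the
            # pre-1.1 PyTorch convention this code base appears to target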
            lr_scheduler.step()
            for node_feat, node_label, edge, mask in train_loader:
                if self.use_gpu:
                    node_feat, node_label, edge, mask = (
                        node_feat.cuda(), node_label.cuda(),
                        edge.cuda(), mask.cuda())

                node_feat, node_label, edge, mask = (
                    node_feat.float(), node_label.long(),
                    edge.long(), mask.byte())
                # optimizer.zero_grad()

                node_logit, _, diff_norm, train_loss, grad_w = model(
                    edge, node_feat, target=node_label, mask=mask)

                # the model returns gradients directly; assign them to the
                # parameters instead of calling train_loss.backward()
                for pp, ww in zip(model.parameters(), grad_w):
                    pp.grad = ww

                optimizer.step()
                train_loss = float(train_loss.data.cpu().numpy())
                results['train_loss'] += [train_loss]
                results['train_step'] += [iter_count]

                # display loss
                if (iter_count + 1) % self.train_conf.display_iter == 0:
                    logger.info(
                        "Loss @ epoch {:04d} iteration {:08d} = {}".format(
                            epoch + 1, iter_count + 1, train_loss))
                    tmp_key = 'diff_norm_{}'.format(iter_count + 1)
                    results[tmp_key] = diff_norm.data.cpu().numpy().tolist()

                iter_count += 1

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model, optimizer,
                         self.config, epoch + 1)

        results['best_val_acc'] += [best_val_acc]
        pickle.dump(
            results,
            open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))

        return best_val_acc
Example #3
    def train(self):
        ### create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            self.graphs_train,
                                                            tag='train')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)

        # create models
        model = eval(self.model_conf.name)(self.config)

        if self.use_gpu:
            model = DataParallel(model, device_ids=self.gpus).to(self.device)

        # create optimizer
        params = filter(lambda p: p.requires_grad, model.parameters())
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_epoch,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        # resume training
        resume_epoch = 0
        if self.train_conf.is_resume:
            model_file = os.path.join(self.train_conf.resume_dir,
                                      self.train_conf.resume_model)
            load_model(model.module if self.use_gpu else model,
                       model_file,
                       self.device,
                       optimizer=optimizer,
                       scheduler=lr_scheduler)
            resume_epoch = self.train_conf.resume_epoch

        # Training Loop
        iter_count = 0
        results = defaultdict(list)
        for epoch in range(resume_epoch, self.train_conf.max_epoch):
            model.train()
            lr_scheduler.step()
            train_iterator = iter(train_loader)

            for inner_iter in range(len(train_loader) // self.num_gpus):
                optimizer.zero_grad()

                batch_data = []
                if self.use_gpu:
                    for _ in self.gpus:
                        data = next(train_iterator)
                        batch_data.append(data)
                        iter_count += 1

                avg_train_loss = .0
                for ff in range(self.dataset_conf.num_fwd_pass):
                    batch_fwd = []

                    if self.use_gpu:
                        for dd, gpu_id in enumerate(self.gpus):
                            # move every tensor in this sub-batch to its GPU
                            data = {
                                kk: batch_data[dd][ff][kk].pin_memory().to(
                                    gpu_id, non_blocking=True)
                                for kk in [
                                    'adj', 'edges', 'node_idx_gnn',
                                    'node_idx_feat', 'label', 'att_idx',
                                    'subgraph_idx'
                                ]
                            }
                            batch_fwd.append((data, ))

                    if batch_fwd:
                        train_loss = model(*batch_fwd).mean()
                        avg_train_loss += train_loss

                        # assign gradient
                        train_loss.backward()

                # clip_grad_norm_(model.parameters(), 5.0e-0)
                optimizer.step()
                avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

                # reduce
                train_loss = float(avg_train_loss.data.cpu().numpy())

                self.writer.add_scalar('train_loss', train_loss, iter_count)
                results['train_loss'] += [train_loss]
                results['train_step'] += [iter_count]

                if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                    logger.info(
                        "NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(
                            epoch + 1, iter_count, train_loss))

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model,
                         optimizer,
                         self.config,
                         epoch + 1,
                         scheduler=lr_scheduler)

        pickle.dump(
            results,
            open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
        self.writer.close()

        return 1
Example #4
    def train(self):
        # create data loader
        train_dataset = BinaryMNIST(self.dataset_conf.path,
                                    num_imgs=self.dataset_conf.num_imgs,
                                    train=True,
                                    transform=transforms.ToTensor(),
                                    download=True)
        val_dataset = BinaryMNIST(self.dataset_conf.path,
                                  num_imgs=self.dataset_conf.num_imgs,
                                  train=False,
                                  transform=transforms.ToTensor(),
                                  download=True)

        train_loader = DataLoader(train_dataset,
                                  batch_size=self.train_conf.batch_size,
                                  shuffle=self.train_conf.shuffle,
                                  num_workers=self.train_conf.num_workers,
                                  drop_last=False)
        val_loader = DataLoader(val_dataset,
                                batch_size=self.train_conf.batch_size,
                                shuffle=False,
                                num_workers=self.train_conf.num_workers,
                                drop_last=False)

        # create models
        model = eval(self.model_conf.name)(self.config)

        # create optimizer
        params = model.parameters()
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_steps,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        # resume training
        if self.train_conf.is_resume:
            load_model(model,
                       self.train_conf.resume_model,
                       optimizer=optimizer)

        if self.use_gpu:
            model = nn.DataParallel(model, device_ids=self.gpus).cuda()

        # Training Loop
        iter_count = 0
        results = defaultdict(list)
        for epoch in range(self.train_conf.max_epoch):
            # validation
            if (epoch + 1) % self.train_conf.valid_epoch == 0 or epoch == 0:
                model.eval()
                val_loss = []
                val_counter = 0
                for imgs, labels in val_loader:
                    if self.use_gpu:
                        imgs, labels = imgs.cuda(), labels.cuda()

                    imgs, labels = imgs.float(), labels.float()
                    imgs_corrupt = self.rand_corrupt(
                        imgs, corrupt_level=self.dataset_conf.corrupt_level)
                    curr_loss, imgs_memory, _, _ = model(imgs_corrupt)
                    img_recover = imgs_memory[-self.model_conf.input_dim:]
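                    # binarize a detached copy of the recovered image at 0.5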
                    img_recover_show = img_recover.clone().detach()
                    img_recover_show.requires_grad = False
                    img_recover_show[img_recover_show >= 0.5] = 1.0
                    img_recover_show[img_recover_show < 0.5] = 0.0
                    val_loss += [float(curr_loss.data.cpu().numpy())]
                    val_counter += 1

                val_loss = float(np.mean(val_loss))
                logger.info("Avg. Validation Loss = {}".format(
                    np.log10(val_loss)))
                results['val_loss'] += [val_loss]
                model.train()

            # training
            lr_scheduler.step()
            for imgs, labels in train_loader:
                if self.use_gpu:
                    imgs, labels = imgs.cuda(), labels.cuda()

                imgs, labels = imgs.float(), labels.float()
                optimizer.zero_grad()
                train_loss, imgs_memory, diff_norm, grad = model(imgs)

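                # the model returns gradients directly; assign them to
                # the parameters instead of calling backward()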
                for pp, ww in zip(model.parameters(), grad):
                    pp.grad = ww

                optimizer.step()
                train_loss = float(train_loss.data.cpu().numpy())
                results['train_loss'] += [train_loss]
                results['train_step'] += [iter_count]

                # display loss
                if (iter_count + 1) % self.train_conf.display_iter == 0:
                    logger.info(
                        "Loss (log10) @ epoch {:04d} iteration {:08d} = {}".format(
                            epoch + 1, iter_count + 1, np.log10(train_loss)))

                    tmp_key = 'diff_norm_{}'.format(iter_count + 1)
                    results[tmp_key] = diff_norm

                iter_count += 1

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model, optimizer,
                         self.config, epoch + 1)

        pickle.dump(
            results,
            open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
Example #5
  def train(self):
    # create data loader
    train_dataset = eval(self.dataset_conf.loader_name)(
        self.config, split='train')
    dev_dataset = eval(self.dataset_conf.loader_name)(self.config, split='dev')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=self.train_conf.shuffle,
        num_workers=self.train_conf.num_workers,
        collate_fn=train_dataset.collate_fn,
        drop_last=False)
    subset_indices = range(self.subsample_size)
    train_loader_sub = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=False,
        num_workers=self.train_conf.num_workers,
        collate_fn=train_dataset.collate_fn,
        drop_last=False,
        sampler=SubsetRandomSampler(subset_indices))
    dev_loader_sub = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=False,
        num_workers=self.train_conf.num_workers,
        collate_fn=dev_dataset.collate_fn,
        drop_last=False,
        sampler=SubsetRandomSampler(subset_indices))

    # create models
    model = eval(self.model_conf.name)(self.model_conf)

    if self.use_gpu:
      model = nn.DataParallel(model, device_ids=self.gpus).cuda()

    # create optimizer
    params = filter(lambda p: p.requires_grad, model.parameters())
    if self.train_conf.optimizer == 'SGD':
      optimizer = optim.SGD(
          params,
          lr=self.train_conf.lr,
          momentum=self.train_conf.momentum,
          weight_decay=self.train_conf.wd)
    elif self.train_conf.optimizer == 'Adam':
      optimizer = optim.Adam(
          params, lr=self.train_conf.lr, weight_decay=self.train_conf.wd)
    else:
      raise ValueError("Non-supported optimizer!")

    early_stop = EarlyStopper([0.0], win_size=10, is_decrease=False)

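    # hold the learning rate constant after a linear warmup phase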
    lr_scheduler = get_constant_schedule_with_warmup(
        optimizer, num_warmup_steps=self.warmup_setps)

    # reset gradient
    optimizer.zero_grad()

    # resume training or start from a pretrained model
    if self.train_conf.is_resume:
      model_snapshot = torch.load(
          self.train_conf.resume_model, map_location=self.device)
      # pretrained checkpoints may only partially match the model,
      # so load them non-strictly
      model.load_state_dict(
          model_snapshot["model"], strict=not self.train_conf.pretrain)
      model.to(self.device)

    # Training Loop
    num_train = len(train_dataset)
    iter_count = 0
    best_val_loss = np.inf
    best_val_loss_test = np.inf
    best_win_pct_val = 0
    best_win_pct_val_test = 0

    results = defaultdict(list)
    for epoch in range(self.train_conf.max_epoch):

      # --------------------------------validation---------------------------------------------
      if (epoch + 1) % self.train_conf.valid_epoch == 0 or epoch == 0:

        #calculate validation loss
        model.eval()
        with torch.no_grad():
          result_dataset_val = self.cal_dataset_loss(model,dev_loader_sub)

        if self.is_val:
          logger.info("-----------------Avg. Validation Loss = {:.4f}, "
          "NMLL = {:.4f}, NMLL_opt = {:.4f}, Win_pct = {:.2f}%, "
          "NMLL_test = {:.4f}, NMLL_test_opt = {:.4f}, "
          "Win_pct_test = {:.2f}%--------------------".format(
            result_dataset_val['loss'], 
            result_dataset_val['nmll'], result_dataset_val['nmll_opt_sm'],
            result_dataset_val['win_pct_ai']*100, 
            result_dataset_val['nmll_test'], result_dataset_val['nmll_opt_sm_test'],
            result_dataset_val['win_pct_ai_test']*100))
          self.writer.add_scalar('nmll_opt_val', result_dataset_val['nmll_opt_sm'], iter_count)
          self.writer.add_scalar('nmll_opt_test_val', result_dataset_val['nmll_opt_sm_test'], iter_count)
          self.writer.add_scalar('win_pct_ai_val', result_dataset_val['win_pct_ai'], iter_count)
          self.writer.add_scalar('win_pct_ai_test_val', result_dataset_val['win_pct_ai_test'], iter_count)
        else:
          logger.info("-----------------Avg. Validation Loss = {:.4f}, "
            "NMLL = {:.4f}, NMLL_orig = {:.4f}, Win_pct = {:.2f}%, "
            "NMLL_test = {:.4f}, NMLL_test_orig = {:.4f}, "
            "Win_pct_test = {:.2f}%--------------------".format(
              result_dataset_val['loss'], 
              result_dataset_val['nmll'], result_dataset_val['nmll_orig'],
              result_dataset_val['win_pct']*100, 
              result_dataset_val['nmll_test'], result_dataset_val['nmll_test_orig'],
              result_dataset_val['win_pct_test']*100))

        self.writer.add_scalar('val_loss', result_dataset_val['loss'], iter_count)
        self.writer.add_scalar('nmll_loss_val', result_dataset_val['nmll'], iter_count)
        self.writer.add_scalar('nmll_loss_orig_val', result_dataset_val['nmll_orig'], iter_count)
        self.writer.add_scalar('nmll_loss_test_val', result_dataset_val['nmll_test'], iter_count)
        self.writer.add_scalar('nmll_loss_test_orig_val', result_dataset_val['nmll_test_orig'], iter_count)
        self.writer.add_scalar('win_pct_val', result_dataset_val['win_pct'], iter_count)
        self.writer.add_scalar('win_pct_val_test', result_dataset_val['win_pct_test'], iter_count)
        results['val_loss'] += [result_dataset_val['loss']]
        results['nmll_loss_val'] += [result_dataset_val['nmll']]
        results['nmll_loss_orig_val'] += [result_dataset_val['nmll_orig']]
        results['nmll_loss_test_val'] += [result_dataset_val['nmll_test']]
        results['nmll_loss_test_orig_val'] += [result_dataset_val['nmll_test_orig']]
        results['win_pct_val'] += [result_dataset_val['win_pct']]
        results['win_pct_val_test'] += [result_dataset_val['win_pct_test']]

        # save best model
        if result_dataset_val['loss'] < best_val_loss:
          best_val_loss = result_dataset_val['loss']
          best_val_loss_test = result_dataset_val['nmll_test']
          if self.is_val:
            best_win_pct_val = result_dataset_val['win_pct_ai']
            best_win_pct_val_test = result_dataset_val['win_pct_ai_test']
          else:
            best_win_pct_val = result_dataset_val['win_pct']
            best_win_pct_val_test = result_dataset_val['win_pct_test']
          snapshot(
              model.module if self.use_gpu else model,
              optimizer,
              self.config,
              epoch + 1,
              tag='best')

        logger.info("Current Best Validation Loss = {:.4f}".format(best_val_loss))

        # check early stop
        if early_stop.tick([result_dataset_val['loss']]):
          snapshot(
              model.module if self.use_gpu else model,
              optimizer,
              self.config,
              epoch + 1,
              tag='last')
          self.writer.close()
          break

      # --------------------------------------training-----------------------------------
      model.train()
      for data in train_loader:
        optimizer.zero_grad()

        if self.use_gpu:
          # move the whole batch to the GPU in one call
          gpu_keys = [
              'max_node_size', 'X_data_tr', 'X_data_val', 'X_data_test',
              'F_tr', 'F_val', 'F_test', 'N_val', 'kernel_mask_val',
              'diagonal_mask_val', 'N_test', 'kernel_mask_test',
              'diagonal_mask_test', 'node_mask_tr', 'dim_mask', 'nmll',
              'dim_size'
          ]
          gpu_vals = data_to_gpu(*[data[kk] for kk in gpu_keys])
          for kk, vv in zip(gpu_keys, gpu_vals):
            data[kk] = vv

        # both variants take the same arguments; GpSMDoubleAttNoMu simply
        # does not return the predicted mean
        model_args = (data['X_data_tr'], data['X_data_val'], data['F_tr'],
                      data['F_val'], data['node_mask_tr'], data['dim_mask'],
                      data['kernel_mask_val'], data['diagonal_mask_val'],
                      data['N_val'])
        model_kwargs = dict(
            device=self.device,
            eval_mode=True,
            X_data_test=data['X_data_test'],
            F_data_test=data['F_test'],
            kernel_mask_test=data['kernel_mask_test'],
            diagonal_mask_test=data['diagonal_mask_test'],
            N_data_test=data['N_test'])

        if self.model_conf.name == 'GpSMDoubleAtt':
          mu, var, weights, nmll, nmll_test = model(*model_args, **model_kwargs)
        elif self.model_conf.name == 'GpSMDoubleAttNoMu':
          var, weights, nmll, nmll_test = model(*model_args, **model_kwargs)
        else:
          raise ValueError("No model of given name!")
        # print("Outside: input size", data['X_data'].shape, "output_size", nmll.shape)

        nmll_orig = data['nmll']
        # fraction of GPs whose learned-kernel NMLL beats the original
        # (within a 0.01 tolerance)
        win_pct_train = torch.sum(
            nmll < nmll_orig + 0.01).float() / nmll.shape[0]

        data_dim_vec = data['X_data_tr'].shape[-1]
        nmll_loss_train = torch.mean(nmll)

        train_loss = nmll_loss_train

        # calculate gradient
        train_loss.backward()

        nmll_loss_orig = torch.mean(nmll_orig)

        # calculate gradient norm
        grad_norm = 0
        for p in model.parameters():
          if p.requires_grad:
            param_norm = p.grad.data.norm()
            grad_norm += param_norm.item() ** 2
        grad_norm = grad_norm ** (1./2)
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        train_loss = float(train_loss.data.cpu().numpy())
        nmll_loss_train = float(nmll_loss_train.data.cpu().numpy())
        nmll_loss_train_orig = float(nmll_loss_orig.data.cpu().numpy())
        win_pct_train = float(win_pct_train.data.cpu().numpy())


        self.writer.add_scalar('train_loss', train_loss, iter_count)
        self.writer.add_scalar('nmll_loss_train', nmll_loss_train, iter_count)
        self.writer.add_scalar('nmll_loss_train_orig', nmll_loss_train_orig, iter_count)
        self.writer.add_scalar('win_pct_train', win_pct_train, iter_count)
        self.writer.add_scalar('grad_norm', grad_norm, iter_count)

        results['nmll_loss_train'] += [nmll_loss_train]
        results['nmll_loss_train_orig'] += [nmll_loss_train_orig]
        results['train_loss'] += [train_loss]
        results['win_pct_train'] += [win_pct_train]
        results['train_step'] += [iter_count]
        results['grad_norm'] += [grad_norm]

        # display loss
        if (iter_count + 1) % self.train_conf.display_iter == 0:
          logger.info("Loss @ epoch {:04d} iteration {:08d} = {:.4f}, NMLL = {:.4f}, NMLL_orig = {:.4f}, Win_pct = {:.2f}%, Grad_norm = {:.4f}, LR = {:.2e}".format(
              epoch + 1, iter_count + 1, train_loss, nmll_loss_train, nmll_loss_train_orig, win_pct_train*100, grad_norm, get_lr(optimizer)))

        iter_count += 1

      # snapshot model
      if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
        logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
        snapshot(model.module if self.use_gpu else model, optimizer,
                 self.config, epoch + 1)

      lr_scheduler.step()


    # look at predictions, for debugging purposes
    model.eval()
    with torch.no_grad():
      results_sample_tr = self.cal_sample_result(model,train_loader_sub)
      results_sample_dev = self.cal_sample_result(model,dev_loader_sub)
      result_dataset_tr = self.cal_dataset_loss(model,train_loader_sub)
      result_dataset_dev = self.cal_dataset_loss(model,dev_loader_sub)

    
    train_loss = result_dataset_tr['loss']
    results['best_val_loss'] = best_val_loss
    results['win_count_tr'] = results_sample_tr['win_pct']
    results['win_count_dev'] = results_sample_dev['win_pct']
    results['nmll_loss_sample_tr'] = results_sample_tr['nmll_loss_sample']
    results['nmll_loss_sample_dev'] = results_sample_dev['nmll_loss_sample']
    pickle.dump(results,
                open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
    self.writer.close()
    logger.info("Best Validation Loss = {:.4f}, "
      "Best Win_pct_val = {:.2f}%, " 
      "Best Val Loss on Test = {:.4f}, "
      "Best Win_pct_val_test = {:.2f}%, "
      "Final Training NMLL = {:.4f}, "
      "Training NMLL original = {:.4f}, "
      "Win_pct_train = {:.2f}%, "
      "Final Dev NMLL = {:.4f}, "
      "Dev NMLL original = {:.4f}, "
      "Win_pct_dev = {:.2f}%, "
      "Final Dev Test NMLL = {:.4f}, "
      "Dev Test NMLL original = {:.4f}, "
      "Win_pct_test_dev = {:.2f}%.".format(
        best_val_loss,
        best_win_pct_val * 100,
        best_val_loss_test,
        best_win_pct_val_test * 100,
        result_dataset_tr['nmll'],
        result_dataset_tr['nmll_orig'],
        result_dataset_tr['win_pct'] * 100,
        result_dataset_dev['nmll'],
        result_dataset_dev['nmll_orig'],
        result_dataset_dev['win_pct'] * 100,
        result_dataset_dev['nmll_test'],
        result_dataset_dev['nmll_test_orig'],
        result_dataset_dev['win_pct_test'] * 100))


    avg_nmll_tr = np.mean(results_sample_tr['nmll_sample_compare'], 0)
    logger.info('% of GPs with higher marginal likelihood = {:.2f}%'.format(
        results_sample_tr['win_pct'] * 100))
    logger.info('Average NMLL on training samples: true = {}, learned = {}'.format(
        avg_nmll_tr[1], avg_nmll_tr[0]))
    avg_nmll_dev = np.mean(results_sample_dev['nmll_sample_compare'], 0)
    logger.info('% of GPs with higher marginal likelihood = {:.2f}%'.format(
        results_sample_dev['win_pct'] * 100))
    logger.info('Average NMLL on testing samples: true = {}, learned = {}'.format(
        avg_nmll_dev[1], avg_nmll_dev[0]))
    snapshot(
        model.module if self.use_gpu else model,
        optimizer,
        self.config,
        self.train_conf.max_epoch + 1,
        tag='final')
    return None
Example #6
    def train(self):
        # create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            split='train')
        dev_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                          split='dev')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)
        dev_loader = torch.utils.data.DataLoader(
            dev_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=False,
            num_workers=self.train_conf.num_workers,
            collate_fn=dev_dataset.collate_fn,
            drop_last=False)

        # create models
        model = eval(self.model_conf.name)(self.config)

        if self.use_gpu:
            model = nn.DataParallel(model, device_ids=self.gpus).cuda()

        # create optimizer
        params = filter(lambda p: p.requires_grad, model.parameters())
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        early_stop = EarlyStopper([0.0], win_size=10, is_decrease=False)

        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_steps,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        # resume training
        if self.train_conf.is_resume:
            load_model(model,
                       self.train_conf.resume_model,
                       optimizer=optimizer)

        # Training Loop
        iter_count = 0
        best_val_loss = np.inf
        results = defaultdict(list)
        for epoch in range(self.train_conf.max_epoch):
            # validation
            if (epoch + 1) % self.train_conf.valid_epoch == 0 or epoch == 0:
                model.eval()
                val_loss = []

                for data in tqdm(dev_loader):
                    if self.use_gpu:
                        data['node_feat'], data['node_mask'], data[
                            'label'] = data_to_gpu(data['node_feat'],
                                                   data['node_mask'],
                                                   data['label'])

                        if self.model_conf.name == 'LanczosNet':
                            data['L'], data['D'], data['V'] = data_to_gpu(
                                data['L'], data['D'], data['V'])
                        elif self.model_conf.name == 'GraphSAGE':
                            data['nn_idx'], data[
                                'nonempty_mask'] = data_to_gpu(
                                    data['nn_idx'], data['nonempty_mask'])
                        elif self.model_conf.name == 'GPNN':
                            data['L'], data['L_cluster'], data[
                                'L_cut'] = data_to_gpu(data['L'],
                                                       data['L_cluster'],
                                                       data['L_cut'])
                        else:
                            data['L'] = data_to_gpu(data['L'])[0]

                    with torch.no_grad():
                        if self.model_conf.name == 'AdaLanczosNet':
                            pred, _ = model(data['node_feat'],
                                            data['L'],
                                            label=data['label'],
                                            mask=data['node_mask'])
                        elif self.model_conf.name == 'LanczosNet':
                            pred, _ = model(data['node_feat'],
                                            data['L'],
                                            data['D'],
                                            data['V'],
                                            label=data['label'],
                                            mask=data['node_mask'])
                        elif self.model_conf.name == 'GraphSAGE':
                            pred, _ = model(data['node_feat'],
                                            data['nn_idx'],
                                            data['nonempty_mask'],
                                            label=data['label'],
                                            mask=data['node_mask'])
                        elif self.model_conf.name == 'GPNN':
                            pred, _ = model(data['node_feat'],
                                            data['L'],
                                            data['L_cluster'],
                                            data['L_cut'],
                                            label=data['label'],
                                            mask=data['node_mask'])
                        else:
                            pred, _ = model(data['node_feat'],
                                            data['L'],
                                            label=data['label'],
                                            mask=data['node_mask'])

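                    # absolute error, rescaled to the original target units
                    # (const_factor presumably undoes label normalization)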
                    curr_loss = (pred - data['label']
                                 ).abs().cpu().numpy() * self.const_factor
                    val_loss += [curr_loss]

                val_loss = float(np.mean(np.concatenate(val_loss)))
                logger.info("Avg. Validation MAE = {}".format(val_loss))
                self.writer.add_scalar('val_loss', val_loss, iter_count)
                results['val_loss'] += [val_loss]

                # save best model
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    snapshot(model.module if self.use_gpu else model,
                             optimizer,
                             self.config,
                             epoch + 1,
                             tag='best')

                logger.info(
                    "Current Best Validation MAE = {}".format(best_val_loss))

                # check early stop
                if early_stop.tick([val_loss]):
                    snapshot(model.module if self.use_gpu else model,
                             optimizer,
                             self.config,
                             epoch + 1,
                             tag='last')
                    self.writer.close()
                    break

            # training
            model.train()
            lr_scheduler.step()
            for data in train_loader:
                optimizer.zero_grad()

                if self.use_gpu:
                    data['node_feat'], data['node_mask'], data[
                        'label'] = data_to_gpu(data['node_feat'],
                                               data['node_mask'],
                                               data['label'])

                    if self.model_conf.name == 'LanczosNet':
                        data['L'], data['D'], data['V'] = data_to_gpu(
                            data['L'], data['D'], data['V'])
                    elif self.model_conf.name == 'GraphSAGE':
                        data['nn_idx'], data['nonempty_mask'] = data_to_gpu(
                            data['nn_idx'], data['nonempty_mask'])
                    elif self.model_conf.name == 'GPNN':
                        data['L'], data['L_cluster'], data[
                            'L_cut'] = data_to_gpu(data['L'],
                                                   data['L_cluster'],
                                                   data['L_cut'])
                    else:
                        data['L'] = data_to_gpu(data['L'])[0]

                if self.model_conf.name == 'AdaLanczosNet':
                    _, train_loss = model(data['node_feat'],
                                          data['L'],
                                          label=data['label'],
                                          mask=data['node_mask'])
                elif self.model_conf.name == 'LanczosNet':
                    _, train_loss = model(data['node_feat'],
                                          data['L'],
                                          data['D'],
                                          data['V'],
                                          label=data['label'],
                                          mask=data['node_mask'])
                elif self.model_conf.name == 'GraphSAGE':
                    _, train_loss = model(data['node_feat'],
                                          data['nn_idx'],
                                          data['nonempty_mask'],
                                          label=data['label'],
                                          mask=data['node_mask'])
                elif self.model_conf.name == 'GPNN':
                    _, train_loss = model(data['node_feat'],
                                          data['L'],
                                          data['L_cluster'],
                                          data['L_cut'],
                                          label=data['label'],
                                          mask=data['node_mask'])
                else:
                    _, train_loss = model(data['node_feat'],
                                          data['L'],
                                          label=data['label'],
                                          mask=data['node_mask'])

                # assign gradient
                train_loss.backward()
                optimizer.step()
                train_loss = float(train_loss.data.cpu().numpy())
                self.writer.add_scalar('train_loss', train_loss, iter_count)
                results['train_loss'] += [train_loss]
                results['train_step'] += [iter_count]

                # display loss
                if (iter_count + 1) % self.train_conf.display_iter == 0:
                    logger.info(
                        "Loss @ epoch {:04d} iteration {:08d} = {}".format(
                            epoch + 1, iter_count + 1, train_loss))

                iter_count += 1

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model, optimizer,
                         self.config, epoch + 1)

        results['best_val_loss'] += [best_val_loss]
        pickle.dump(
            results,
            open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
        self.writer.close()
        logger.info("Best Validation MAE = {}".format(best_val_loss))

        return best_val_loss
Example #7
    def train(self):
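        # anomaly detection localizes NaN/Inf in the backward pass, at a
        # significant runtime cost; typically enabled only while debugging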
        torch.autograd.set_detect_anomaly(True)

        ### create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            self.graphs_train,
                                                            tag='train')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,  # true for grid
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)

        # create models
        model = eval(self.model_conf.name)(self.config)
        criterion = nn.BCEWithLogitsLoss()

        if self.use_gpu:
            model = DataParallel(model, device_ids=self.gpus).to(self.device)
            criterion = criterion.cuda()
        model.train()

        # create optimizer
        params = filter(lambda p: p.requires_grad, model.parameters())
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        # TODO: not used?
        early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_epoch,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        best_acc = 0.
        # resume training
        # TODO: record resume_epoch to the saved file
        resume_epoch = 0
        if self.train_conf.is_resume:
            model_file = os.path.join(self.train_conf.resume_dir,
                                      self.train_conf.resume_model)
            load_model(model.module if self.use_gpu else model,
                       model_file,
                       self.device,
                       optimizer=optimizer,
                       scheduler=lr_scheduler)
            resume_epoch = self.train_conf.resume_epoch

        # Training Loop
        iter_count = 0
        results = defaultdict(list)
        for epoch in range(resume_epoch, self.train_conf.max_epoch):
            model.train()
            train_iterator = iter(train_loader)

            avg_acc_whole_epoch = 0.
            cnt = 0.

            for inner_iter in range(len(train_loader) // self.num_gpus):
                optimizer.zero_grad()

                batch_data = []
                if self.use_gpu:
                    for _ in self.gpus:
                        data = next(train_iterator)
                        batch_data.append(data)
                        iter_count += 1

                avg_train_loss = .0
                avg_acc = 0.
                for ff in range(self.dataset_conf.num_fwd_pass):
                    batch_fwd = []

                    if self.use_gpu:
                        for dd, gpu_id in enumerate(self.gpus):
                            # only the keys needed for graph-level
                            # classification are moved to the GPU; the
                            # node-level keys (node_idx_gnn, node_idx_feat,
                            # label, att_idx) are unused here
                            data = {
                                kk: batch_data[dd][ff][kk].pin_memory().to(
                                    gpu_id, non_blocking=True)
                                for kk in [
                                    'adj', 'edges', 'subgraph_idx',
                                    'complete_graph_label'
                                ]
                            }
                            batch_fwd.append((data, ))

                    pred = model(*batch_fwd)
                    label = data['complete_graph_label'][:, None]
                    train_loss = criterion(pred, label).mean()
                    train_loss.backward()

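                    # binarize the logits at 0.5 to measure accuracy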
                    pred = (torch.sigmoid(pred) > 0.5).type_as(label)
                    avg_acc += (pred.eq(label)).float().mean().item()

                    avg_train_loss += train_loss.item()

                # clip_grad_norm_(model.parameters(), 5.0e-0)
                optimizer.step()
                lr_scheduler.step()
                avg_train_loss /= self.dataset_conf.num_fwd_pass  # num_fwd_pass always 1
                avg_acc /= self.dataset_conf.num_fwd_pass

                # weight each batch's accuracy by its size so the epoch
                # average below is a proper per-sample mean
                avg_acc_whole_epoch += avg_acc * len(
                    data['complete_graph_label'])
                cnt += len(data['complete_graph_label'])

                # reduce
                self.writer.add_scalar('train_loss', avg_train_loss,
                                       iter_count)
                self.writer.add_scalar('train_acc', avg_acc, iter_count)
                results['train_loss'] += [avg_train_loss]
                results['train_acc'] += [avg_acc]
                results['train_step'] += [iter_count]

                # if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                #   logger.info("NLL Loss @ epoch {:04d} iteration {:08d} = {}\tAcc = {}".format(epoch + 1, iter_count, train_loss, avg_acc))

            avg_acc_whole_epoch /= cnt
            is_new_best = avg_acc_whole_epoch > best_acc
            if is_new_best:
                logger.info('!!! New best')
                best_acc = avg_acc_whole_epoch
            logger.info("Avg acc = {} @ epoch {:04d}".format(
                avg_acc_whole_epoch, epoch + 1))

            # snapshot model
            if (epoch +
                    1) % self.train_conf.snapshot_epoch == 0 or is_new_best:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model,
                         optimizer,
                         self.config,
                         epoch + 1,
                         scheduler=lr_scheduler)

        pickle.dump(
            results,
            open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
        self.writer.close()

        return 1
Example #8
    def train(self):
        ### create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            self.graphs_train,
                                                            tag='train')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,  # true for grid
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)

        # create models
        # model = eval(self.model_conf.name)(self.config)
        args = self.config.model
        n_labels = self.dataset_conf.max_m + self.dataset_conf.max_n
        G = define_G(args.nz, args.ngf, args.netG, args.final_activation,
                     args.norm_G)
        D = define_D(args.ndf, args.netD, norm=args.norm_D)

        ### define losses
        criterionGAN = GANLoss(args.gan_mode)
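        # GANLoss here presumably follows the pix2pix-style helper, where
        # gan_mode selects the vanilla, lsgan, or wgangp objective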
        rote_loss = nn.L1Loss(reduction='none')
        if args.sparsity > 0.:
            sparse_loss = nn.L1Loss()

        if self.use_gpu:
            # G = DataParallel(G).to(self.device)
            # D = DataParallel(D).to(self.device)
            G = G.cuda()
            D = D.cuda()
            criterionGAN = criterionGAN.to(self.device)
            rote_loss = rote_loss.cuda()
            if args.sparsity > 0.:
                sparse_loss = sparse_loss.cuda()

        G.train()
        D.train()

        # create optimizer
        G_params = filter(lambda p: p.requires_grad, G.parameters())
        D_params = filter(lambda p: p.requires_grad, D.parameters())
        optimizer_G = optim.Adam(G_params,
                                 lr=self.train_conf.lr,
                                 betas=(self.train_conf.beta1, 0.999))
        optimizer_D = optim.Adam(D_params,
                                 lr=self.train_conf.lr,
                                 betas=(self.train_conf.beta1, 0.999))
        fake_pool = ImagePool(args.pool_size)

        # resume training
        # TODO: record resume_epoch to the saved file
        resume_epoch = 0
        if self.train_conf.is_resume:
            model_file_G = os.path.join(self.train_conf.resume_dir,
                                        'G_' + self.train_conf.resume_model)
            model_file_D = os.path.join(self.train_conf.resume_dir,
                                        'D_' + self.train_conf.resume_model)
            load_model(G, model_file_G, self.device, optimizer=optimizer_G)
            load_model(D, model_file_D, self.device, optimizer=optimizer_D)
            resume_epoch = int(
                osp.splitext(self.train_conf.resume_model)[0].split('_')[-1])
            # epoch parsed from the checkpoint filename (originally
            # self.train_conf.resume_epoch)

        # Training Loop
        iter_count = 0  # iteration index throughout the whole training run
        results = defaultdict(list)
        for epoch in range(resume_epoch, self.train_conf.max_epoch):
            train_iterator = iter(train_loader)

            for batch_data in train_iterator:
                set_requires_grad(D, False)
                # set_requires_grad(G, True)
                optimizer_G.zero_grad()

                iter_count += 1
                ff = 0  # single forward pass per batch (asserted in the arg helper)
                data = {}
                data['adj'] = batch_data[ff]['adj'].pin_memory().to(
                    self.config.device, non_blocking=True)
                data['m'] = batch_data[ff]['m'].to(self.config.device,
                                                   non_blocking=True)
                data['n'] = batch_data[ff]['n'].to(self.config.device,
                                                   non_blocking=True)

                batch_size = data['adj'].size(0)

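                # one-hot encode the graph-size labels: scatter_ writes a 1 at
                # column (m - 1) resp. (n - 1) of each row, and the two codes
                # are concatenated into the GAN condition y_onehot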
                i_onehot = torch.zeros(
                    (batch_size, self.dataset_conf.max_m),
                    requires_grad=True).pin_memory().to(self.config.device,
                                                        non_blocking=True)
                i_onehot.scatter_(1, data['m'][:, None] - 1, 1)
                j_onehot = torch.zeros(
                    (batch_size, self.dataset_conf.max_n),
                    requires_grad=True).pin_memory().to(self.config.device,
                                                        non_blocking=True)
                j_onehot.scatter_(1, data['n'][:, None] - 1, 1)
                y_onehot = torch.cat((i_onehot, j_onehot), dim=1)

                if args.nz > n_labels:
                    noise = torch.randn(
                        (batch_size, args.nz - n_labels, 1, 1),
                        requires_grad=True).to(self.config.device,
                                               non_blocking=True)
                    z_input = torch.cat(
                        (y_onehot.view(batch_size, n_labels, 1, 1), noise),
                        dim=1)
                else:
                    z_input = y_onehot.view(batch_size, n_labels, 1, 1)

                output = G(z_input)  # (B, 1, n, n)
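                # symmetrize: keep the strictly lower triangle and mirror it
                # above the diagonal, which also zeroes out self-loops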
                if self.model_conf.is_sym:
                    output = torch.tril(output, diagonal=-1)
                    output = output + output.transpose(2, 3)

                loss_G = 0.
                if args.sparsity > 0:
                    loss_G_sparse = sparse_loss(output,
                                                torch.zeros_like(output))
                    loss_G += args.sparsity * loss_G_sparse
                if args.lambda_rote > 0:
                    if args.final_activation == 'tanh':
                        tmp_obj = (data['adj'] - 0.5) * 2
                    else:
                        tmp_obj = data['adj']
                    loss_G_rote = rote_loss(output, tmp_obj)
                    rote_mask = (loss_G_rote > 0.2).type_as(loss_G_rote)
                    loss_G_rote = (loss_G_rote * rote_mask).mean()
                    loss_G += args.lambda_rote * loss_G_rote

                # backward G

                loss_G_GAN = criterionGAN(D(output), True)
                loss_G += loss_G_GAN
                loss_G.backward()
                optimizer_G.step()

                # backward D
                set_requires_grad(D, True)
                # set_requires_grad(G, False)
                optimizer_D.zero_grad()
                # soften the real labels: exact 0/1 adjacency entries become
                # noisy targets so D cannot latch onto hard values
                real = data['adj'].clone()  # clone keeps data['adj'] intact

                if args.final_activation == 'sigmoid':
                    ones_soft = torch.rand_like(real) * 0.1 + 0.9
                    zeros_soft = torch.rand_like(real) * 0.1
                elif args.final_activation == 'tanh':
                    ones_soft = torch.rand_like(real) * 0.2 + 0.8
                    zeros_soft = -(torch.rand_like(real) * 0.2 + 0.8)
                else:
                    raise ValueError("Unsupported final_activation: {}".format(
                        args.final_activation))
                ones_mask = (real == 1.)
                zeros_mask = (real == 0.)
                real[ones_mask] = ones_soft[ones_mask]
                real[zeros_mask] = zeros_soft[zeros_mask]
                if self.model_conf.is_sym:
                    real = torch.tril(real, diagonal=-1)
                    real = real + real.transpose(2, 3)
                pred_real = D(real)
                loss_D_real = criterionGAN(pred_real, True)
                # Fake
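                # an ImagePool-style buffer returns a mix of current and past
                # generator outputs, so D is not fit only to the newest fakes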
                if args.pool_size:
                    queried_fake = fake_pool.query(output.detach())
                else:
                    queried_fake = output.detach()
                pred_fake = D(queried_fake)
                loss_D_fake = criterionGAN(pred_fake, False)
                # Combined loss and calculate gradients
                loss_D = (loss_D_real + loss_D_fake) * 0.5
                loss_D.backward()
                optimizer_D.step()

                # reduce
                self.writer.add_scalar('train_loss_G', loss_G.item(),
                                       iter_count)
                self.writer.add_scalar('train_loss_D', loss_D.item(),
                                       iter_count)
                results['train_loss_G'] += [loss_G.item()]
                results['train_loss_D'] += [loss_D.item()]
                results['train_step'] += [iter_count]

                if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                    logger.info(
                        "@ epoch {:04d} iter {:08d} loss_G: {:.5f}, loss_G_GAN: {:.5f}, loss_D: {:.5f}, loss_D_real: {:.5f}, loss_D_fake: {:.5f}"
                        .format(epoch + 1, iter_count, loss_G.item(),
                                loss_G_GAN.item(), loss_D.item(),
                                loss_D_real.item(), loss_D_fake.item()))
                    if args.lambda_rote > 0:
                        logger.info(
                            "@ epoch {:04d} iter {:08d} loss_rote: {:.5f}".
                            format(epoch + 1, iter_count, loss_G_rote.item()))

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(G,
                         optimizer_G,
                         self.config,
                         epoch + 1,
                         fname_prefix='G_')
                snapshot(D,
                         optimizer_D,
                         self.config,
                         epoch + 1,
                         fname_prefix='D_')

        with open(os.path.join(self.config.save_dir, 'train_stats.p'),
                  'wb') as f:
            pickle.dump(results, f)
        self.writer.close()
        return 1
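
The loop above toggles the discriminator with set_requires_grad, whose body is
not shown in these listings. A minimal sketch consistent with the usual
pix2pix-style helper (an assumption, not the definition from this codebase):

def set_requires_grad(nets, requires_grad=False):
    # freeze or unfreeze every parameter of one or more networks, so D is
    # left untouched while G's gradients are computed, and vice versa
    if not isinstance(nets, (list, tuple)):
        nets = [nets]
    for net in nets:
        if net is not None:
            for p in net.parameters():
                p.requires_grad = requires_grad
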
Example #9
    def train(self):
        ### create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            self.graphs_train,
                                                            tag='train')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)

        # create models
        model = eval(self.model_conf.name)(self.config)
        print('number of parameters : {}'.format(
            sum([np.prod(x.shape) for x in model.parameters()])))

        if self.use_gpu:
            model = DataParallel(model, device_ids=self.gpus).to(self.device)

        # create optimizer
        params = filter(lambda p: p.requires_grad, model.parameters())
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)

        # the scheduler must wrap the live optimizer; building it on a
        # deepcopy would leave the real learning rate untouched
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_epoch,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        # resume training
        resume_epoch = 0
        if self.train_conf.is_resume:
            model_file = os.path.join(self.train_conf.resume_dir,
                                      self.train_conf.resume_model)
            load_model(model.module if self.use_gpu else model,
                       model_file,
                       self.device,
                       optimizer=optimizer,
                       scheduler=lr_scheduler)
            resume_epoch = self.train_conf.resume_epoch

        # Training Loop
        iter_count = 0
        results = defaultdict(list)
        for epoch in range(resume_epoch, self.train_conf.max_epoch):
            model.train()
            # lr_scheduler.step()
            train_iterator = iter(train_loader)

            for inner_iter in range(len(train_loader) // self.num_gpus):
                optimizer.zero_grad()

                batch_data = []
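                # draw one batch per GPU; each batch is pinned and moved to
                # its own device below before the data-parallel forward pass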
                if self.use_gpu:
                    for _ in self.gpus:
                        data = next(train_iterator)
                        batch_data.append(data)
                        iter_count += 1

                avg_train_loss = .0
                for ff in range(self.dataset_conf.num_fwd_pass):
                    batch_fwd = []

                    if self.use_gpu:
                        for dd, gpu_id in enumerate(self.gpus):
                            data = {}
                            data['adj'] = batch_data[dd][ff]['adj'].pin_memory(
                            ).to(gpu_id, non_blocking=True)
                            data['edges'] = batch_data[dd][ff][
                                'edges'].pin_memory().to(gpu_id,
                                                         non_blocking=True)
                            data['node_idx_gnn'] = batch_data[dd][ff][
                                'node_idx_gnn'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            data['node_idx_feat'] = batch_data[dd][ff][
                                'node_idx_feat'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            data['label'] = batch_data[dd][ff][
                                'label'].pin_memory().to(gpu_id,
                                                         non_blocking=True)
                            data['att_idx'] = batch_data[dd][ff][
                                'att_idx'].pin_memory().to(gpu_id,
                                                           non_blocking=True)
                            data['subgraph_idx'] = batch_data[dd][ff][
                                'subgraph_idx'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            batch_fwd.append((data, ))

                    if batch_fwd:
                        train_loss = model(*batch_fwd).mean()
                        avg_train_loss += train_loss

                        # assign gradient
                        train_loss.backward()

                # clip_grad_norm_(model.parameters(), 5.0e-0)
                optimizer.step()
                avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

                # reduce
                train_loss = avg_train_loss.item()

                self.writer.add_scalar('train_loss', train_loss, iter_count)
                results['train_loss'] += [train_loss]
                results['train_step'] += [iter_count]

                if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                    logger.info(
                        "NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(
                            epoch + 1, iter_count, train_loss))

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model,
                         optimizer,
                         self.config,
                         epoch + 1,
                         scheduler=lr_scheduler)

            if (epoch + 1) % 20 == 0:
                print('saving graphs')
                model.eval()
                graphs_gen = [
                    get_graph(aa.cpu().data.numpy())
                    for aa in model.module._sampling(10)
                ]
                model.train()

                vis_graphs = []
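                # keep only the largest connected component of each sample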
                for gg in graphs_gen:
                    CGs = [gg.subgraph(c) for c in nx.connected_components(gg)]
                    CGs = sorted(CGs,
                                 key=lambda x: x.number_of_nodes(),
                                 reverse=True)
                    vis_graphs += [CGs[0]]

                total = len(vis_graphs)  #min(3, len(vis_graphs))
                draw_graph_list(vis_graphs[:total],
                                2,
                                int(total // 2),
                                fname='sample/gran_%d.png' % epoch,
                                layout='spring')

        with open(os.path.join(self.config.save_dir, 'train_stats.p'),
                  'wb') as f:
            pickle.dump(results, f)
        self.writer.close()

        return 1
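
Every example checkpoints through snapshot(...) and restores through
load_model(...); neither body appears in these listings. The sketch below is
a hypothetical reading of the minimal contract their call sites imply (the
file-name pattern and checkpoint keys are assumptions):

import os
import torch

def snapshot(model, optimizer, config, epoch, scheduler=None, fname_prefix=''):
    # bundle all state dicts into a single checkpoint file per epoch
    ckpt = {'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch}
    if scheduler is not None:
        ckpt['scheduler'] = scheduler.state_dict()
    fname = '{}model_snapshot_{:07d}.pth'.format(fname_prefix, epoch)
    torch.save(ckpt, os.path.join(config.save_dir, fname))

def load_model(model, model_file, device, optimizer=None, scheduler=None):
    # restore saved states in place and hand them back to the caller
    ckpt = torch.load(model_file, map_location=device)
    model.load_state_dict(ckpt['model'])
    if optimizer is not None and 'optimizer' in ckpt:
        optimizer.load_state_dict(ckpt['optimizer'])
    if scheduler is not None and 'scheduler' in ckpt:
        scheduler.load_state_dict(ckpt['scheduler'])
    return {'model': model, 'optimizer': optimizer, 'scheduler': scheduler}
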
Example #10
    def train(self):
        ### create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            self.graphs_train,
                                                            tag='train')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)

        # create models
        # model = eval(self.model_conf.name)(self.config)
        from model.transformer import make_model
        model = make_model(max_node=self.config.model.max_num_nodes,
                           d_out=20,
                           N=7,
                           d_model=64,
                           d_ff=64,
                           dropout=0.4)  # d_out, N, d_model, d_ff, h
        # d_out=20, N=15, d_model=16, d_ff=16, dropout=0.2) # d_out, N, d_model, d_ff, h
        # d_out=20, N=3, d_model=64, d_ff=64, dropout=0.1) # d_out, N, d_model, d_ff, h

        if self.use_gpu:
            model = DataParallel(model, device_ids=self.gpus).to(self.device)

        # create optimizer
        params = filter(lambda p: p.requires_grad, model.parameters())
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_epoch,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        # resume training
        resume_epoch = 0
        if self.train_conf.is_resume:
            model_file = os.path.join(self.train_conf.resume_dir,
                                      self.train_conf.resume_model)
            load_model(model.module if self.use_gpu else model,
                       model_file,
                       self.device,
                       optimizer=optimizer,
                       scheduler=lr_scheduler)
            resume_epoch = self.train_conf.resume_epoch

        # Training Loop
        iter_count = 0
        results = defaultdict(list)
        for epoch in range(resume_epoch, self.train_conf.max_epoch):
            model.train()
            train_iterator = iter(train_loader)

            for inner_iter in range(len(train_loader) // self.num_gpus):
                optimizer.zero_grad()

                batch_data = []
                if self.use_gpu:
                    for _ in self.gpus:
                        data = next(train_iterator)
                        batch_data += [data]

                avg_train_loss = .0
                for ff in range(self.dataset_conf.num_fwd_pass):
                    batch_fwd = []

                    if self.use_gpu:
                        for dd, gpu_id in enumerate(self.gpus):
                            data = batch_data[dd]

                            adj, lens = data['adj'], data['lens']

                            # this is only for grid
                            # adj = adj[:, :, :100, :100]
                            # lens = [min(99, x) for x in lens]

                            adj = adj.to('cuda:%d' % gpu_id)

                            # build masks
                            node_feat, attn_mask, lens = preprocess(adj, lens)
                            batch_fwd.append(
                                (node_feat, attn_mask.clone(), lens))

                    if batch_fwd:
                        node_feat, attn_mask, lens = batch_fwd[0]
                        log_theta, log_alpha = model(*batch_fwd)

                        train_loss = model.module.mix_bern_loss(
                            log_theta, log_alpha, adj, lens)

                        avg_train_loss += train_loss

                        # assign gradient
                        train_loss.backward()

                # clip_grad_norm_(model.parameters(), 5.0e-0)
                optimizer.step()
                avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

                # reduce
                train_loss = avg_train_loss.item()

                self.writer.add_scalar('train_loss', train_loss, iter_count)
                results['train_loss'] += [train_loss]
                results['train_step'] += [iter_count]

                if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                    logger.info(
                        "NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(
                            epoch + 1, iter_count, train_loss))

                if epoch % 50 == 0 and inner_iter == 0:
                    model.eval()
                    print('saving graphs')
                    graphs_gen = [get_graph(adj[0].cpu().data.numpy())] + [
                        get_graph(aa.cpu().data.numpy())
                        for aa in model.module.sample(
                            19, max_node=self.config.model.max_num_nodes)
                    ]
                    model.train()

                    vis_graphs = []
                    for gg in graphs_gen:
                        CGs = [
                            gg.subgraph(c) for c in nx.connected_components(gg)
                        ]
                        CGs = sorted(CGs,
                                     key=lambda x: x.number_of_nodes(),
                                     reverse=True)
                        if CGs:
                            vis_graphs += [CGs[0]]

                    try:
                        total = len(vis_graphs)  #min(3, len(vis_graphs))
                        draw_graph_list(vis_graphs[:total],
                                        4,
                                        int(total // 4),
                                        fname='sample/trans_sl:%d_%d.png' %
                                        (int(model.module.self_loop), epoch),
                                        layout='spring')
                    except Exception:
                        print('sample saving failed')

            # step the LR schedule once per epoch, after the optimizer updates
            lr_scheduler.step()

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model,
                         optimizer,
                         self.config,
                         epoch + 1,
                         scheduler=lr_scheduler)

        with open(os.path.join(self.config.save_dir, 'train_stats.p'),
                  'wb') as f:
            pickle.dump(results, f)
        self.writer.close()

        return 1
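
Example #10 trains against model.module.mix_bern_loss, a mixture-of-Bernoulli
negative log-likelihood over adjacency entries, whose implementation is not
shown. The sketch below is a plausible reading under assumed shapes (log_theta
as per-edge logits for K mixture components, log_alpha as mixture logits); it
omits the lens-based masking of padded nodes that the real call receives:

import torch
import torch.nn.functional as F

def mix_bern_nll(log_theta, log_alpha, adj):
    # log_theta: (B, N, N, K) edge logits, one set per mixture component
    # log_alpha: (B, K) unnormalized mixture weights
    # adj:       (B, N, N) binary adjacency matrices (float 0./1.)
    target = adj.unsqueeze(-1).expand_as(log_theta)
    # per-edge Bernoulli log-likelihood under each component
    log_p = -F.binary_cross_entropy_with_logits(
        log_theta, target, reduction='none')
    log_p = log_p.sum(dim=(1, 2))               # (B, K): sum over all edges
    log_mix = F.log_softmax(log_alpha, dim=-1)  # (B, K)
    # marginalize over components with a numerically stable log-sum-exp
    return -torch.logsumexp(log_mix + log_p, dim=-1).mean()
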