Example #1
    def train(self):
        ### create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            self.graphs_train,
                                                            tag='train')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)

        # create models
        model = eval(self.model_conf.name)(self.config)

        if self.use_gpu:
            model = DataParallel(model, device_ids=self.gpus).to(self.device)

        # create optimizer
        params = filter(lambda p: p.requires_grad, model.parameters())
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_epoch,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        # resume training
        resume_epoch = 0
        if self.train_conf.is_resume:
            model_file = os.path.join(self.train_conf.resume_dir,
                                      self.train_conf.resume_model)
            load_model(model.module if self.use_gpu else model,
                       model_file,
                       self.device,
                       optimizer=optimizer,
                       scheduler=lr_scheduler)
            resume_epoch = self.train_conf.resume_epoch

        # Training Loop
        iter_count = 0
        results = defaultdict(list)
        for epoch in range(resume_epoch, self.train_conf.max_epoch):
            model.train()
            lr_scheduler.step()
            train_iterator = iter(train_loader)

            for inner_iter in range(len(train_loader) // self.num_gpus):
                optimizer.zero_grad()

                batch_data = []
                if self.use_gpu:
                    for _ in self.gpus:
                        data = next(train_iterator)
                        batch_data.append(data)
                        iter_count += 1

                avg_train_loss = .0
                for ff in range(self.dataset_conf.num_fwd_pass):
                    batch_fwd = []

                    if self.use_gpu:
                        for dd, gpu_id in enumerate(self.gpus):
                            data = {}
                            data['adj'] = batch_data[dd][ff]['adj'].pin_memory(
                            ).to(gpu_id, non_blocking=True)
                            data['edges'] = batch_data[dd][ff][
                                'edges'].pin_memory().to(gpu_id,
                                                         non_blocking=True)
                            data['node_idx_gnn'] = batch_data[dd][ff][
                                'node_idx_gnn'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            data['node_idx_feat'] = batch_data[dd][ff][
                                'node_idx_feat'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            data['label'] = batch_data[dd][ff][
                                'label'].pin_memory().to(gpu_id,
                                                         non_blocking=True)
                            data['att_idx'] = batch_data[dd][ff][
                                'att_idx'].pin_memory().to(gpu_id,
                                                           non_blocking=True)
                            data['subgraph_idx'] = batch_data[dd][ff][
                                'subgraph_idx'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            batch_fwd.append((data, ))

                    if batch_fwd:
                        train_loss = model(*batch_fwd).mean()
                        avg_train_loss += train_loss

                        # assign gradient
                        train_loss.backward()

                # clip_grad_norm_(model.parameters(), 5.0e-0)
                optimizer.step()
                avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

                # reduce
                train_loss = float(avg_train_loss.data.cpu().numpy())

                self.writer.add_scalar('train_loss', train_loss, iter_count)
                results['train_loss'] += [train_loss]
                results['train_step'] += [iter_count]

                if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                    logger.info(
                        "NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(
                            epoch + 1, iter_count, train_loss))

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model,
                         optimizer,
                         self.config,
                         epoch + 1,
                         scheduler=lr_scheduler)

        pickle.dump(
            results,
            open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
        self.writer.close()

        return 1
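
The per-GPU copy loop above pins each CPU tensor and then issues a non-blocking transfer, so the host-to-device copies can overlap with computation. Below is a minimal, self-contained sketch of that pattern; the tensor names and shapes are placeholders for illustration, not the actual loader output.

import torch

def to_device_async(batch, device):
    # Pin each CPU tensor so the host-to-device copy can run asynchronously,
    # then issue the copy with non_blocking=True (only effective for pinned memory).
    return {k: v.pin_memory().to(device, non_blocking=True) for k, v in batch.items()}

if torch.cuda.is_available():
    device = torch.device('cuda:0')
    # toy batch for illustration; the real batch holds 'adj', 'edges', 'label', etc.
    batch = {'adj': torch.randn(4, 16, 16), 'label': torch.randint(0, 2, (4,))}
    batch = to_device_async(batch, device)
    print({k: v.device for k, v in batch.items()})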
Example #2
    def train(self):
        ### create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            self.graphs_train,
                                                            tag='train')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)

        # create models
        model = eval(self.model_conf.name)(self.config)
        print('number of parameters : {}'.format(
            sum([np.prod(x.shape) for x in model.parameters()])))

        if self.use_gpu:
            model = DataParallel(model, device_ids=self.gpus).to(self.device)

        # create optimizer
        params = filter(lambda p: p.requires_grad, model.parameters())
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)

        from copy import deepcopy
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            deepcopy(optimizer),
            milestones=self.train_conf.lr_decay_epoch,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        # resume training
        resume_epoch = 0
        if self.train_conf.is_resume:
            model_file = os.path.join(self.train_conf.resume_dir,
                                      self.train_conf.resume_model)
            load_model(model.module if self.use_gpu else model,
                       model_file,
                       self.device,
                       optimizer=optimizer,
                       scheduler=lr_scheduler)
            resume_epoch = self.train_conf.resume_epoch

        # Training Loop
        iter_count = 0
        results = defaultdict(list)
        for epoch in range(resume_epoch, self.train_conf.max_epoch):
            has_sampled = False
            model.train()
            # lr_scheduler.step()
            train_iterator = iter(train_loader)

            for inner_iter in range(len(train_loader) // self.num_gpus):
                optimizer.zero_grad()

                batch_data = []
                if self.use_gpu:
                    for _ in self.gpus:
                        data = next(train_iterator)
                        batch_data.append(data)
                        iter_count += 1

                avg_train_loss = .0
                for ff in range(self.dataset_conf.num_fwd_pass):
                    batch_fwd = []

                    if self.use_gpu:
                        for dd, gpu_id in enumerate(self.gpus):
                            data = {}
                            data['adj'] = batch_data[dd][ff]['adj'].pin_memory(
                            ).to(gpu_id, non_blocking=True)
                            data['edges'] = batch_data[dd][ff][
                                'edges'].pin_memory().to(gpu_id,
                                                         non_blocking=True)
                            data['node_idx_gnn'] = batch_data[dd][ff][
                                'node_idx_gnn'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            data['node_idx_feat'] = batch_data[dd][ff][
                                'node_idx_feat'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            data['label'] = batch_data[dd][ff][
                                'label'].pin_memory().to(gpu_id,
                                                         non_blocking=True)
                            data['att_idx'] = batch_data[dd][ff][
                                'att_idx'].pin_memory().to(gpu_id,
                                                           non_blocking=True)
                            data['subgraph_idx'] = batch_data[dd][ff][
                                'subgraph_idx'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            batch_fwd.append((data, ))

                    if batch_fwd:
                        train_loss = model(*batch_fwd).mean()
                        avg_train_loss += train_loss

                        # assign gradient
                        train_loss.backward()

                # clip_grad_norm_(model.parameters(), 5.0e-0)
                optimizer.step()
                avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

                # reduce
                train_loss = float(avg_train_loss.data.cpu().numpy())

                self.writer.add_scalar('train_loss', train_loss, iter_count)
                results['train_loss'] += [train_loss]
                results['train_step'] += [iter_count]

                if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                    logger.info(
                        "NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(
                            epoch + 1, iter_count, train_loss))

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model,
                         optimizer,
                         self.config,
                         epoch + 1,
                         scheduler=lr_scheduler)

            if (epoch + 1) % 20 == 0 and not has_sampled:
                has_sampled = True
                print('saving graphs')
                model.eval()
                graphs_gen = [
                    get_graph(aa.cpu().data.numpy())
                    for aa in model.module._sampling(10)
                ]
                model.train()

                vis_graphs = []
                for gg in graphs_gen:
                    CGs = [gg.subgraph(c) for c in nx.connected_components(gg)]
                    CGs = sorted(CGs,
                                 key=lambda x: x.number_of_nodes(),
                                 reverse=True)
                    vis_graphs += [CGs[0]]

                total = len(vis_graphs)  #min(3, len(vis_graphs))
                draw_graph_list(vis_graphs[:total],
                                2,
                                int(total // 2),
                                fname='sample/gran_%d.png' % epoch,
                                layout='spring')

        pickle.dump(
            results,
            open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
        self.writer.close()

        return 1
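
Both variants above run num_fwd_pass forward/backward passes, calling loss.backward() each time, before a single optimizer.step(), i.e. plain gradient accumulation. A minimal sketch of that accumulation pattern with a toy model and random data (the model and batch here are placeholders, not the GRAN loader):

import torch
import torch.nn as nn
import torch.optim as optim

# toy model and data; stands in for the GRAN model and loader above
model = nn.Linear(8, 1)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
num_fwd_pass = 4

optimizer.zero_grad()
avg_loss = 0.0
for _ in range(num_fwd_pass):
    x, y = torch.randn(16, 8), torch.randn(16, 1)
    loss = nn.functional.mse_loss(model(x), y)
    loss.backward()              # gradients accumulate across forward passes
    avg_loss += loss.item()
optimizer.step()                 # a single update for the accumulated gradient
print('avg loss over passes:', avg_loss / num_fwd_pass)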
Example #3
    def train(self):
        torch.autograd.set_detect_anomaly(True)

        ### create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            self.graphs_train,
                                                            tag='train')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,  # true for grid
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)

        # create models
        model = eval(self.model_conf.name)(self.config)
        criterion = nn.BCEWithLogitsLoss()

        if self.use_gpu:
            model = DataParallel(model, device_ids=self.gpus).to(self.device)
            criterion = criterion.cuda()
        model.train()

        # create optimizer
        params = filter(lambda p: p.requires_grad, model.parameters())
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        # TODO: not used?
        early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_epoch,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        best_acc = 0.
        # resume training
        # TODO: record resume_epoch to the saved file
        resume_epoch = 0
        if self.train_conf.is_resume:
            model_file = os.path.join(self.train_conf.resume_dir,
                                      self.train_conf.resume_model)
            load_model(model.module if self.use_gpu else model,
                       model_file,
                       self.device,
                       optimizer=optimizer,
                       scheduler=lr_scheduler)
            resume_epoch = self.train_conf.resume_epoch

        # Training Loop
        iter_count = 0
        results = defaultdict(list)
        for epoch in range(resume_epoch, self.train_conf.max_epoch):
            model.train()
            train_iterator = iter(train_loader)

            avg_acc_whole_epoch = 0.
            cnt = 0.

            for inner_iter in range(len(train_loader) // self.num_gpus):
                optimizer.zero_grad()

                batch_data = []
                if self.use_gpu:
                    for _ in self.gpus:
                        data = next(train_iterator)
                        batch_data.append(data)
                        iter_count += 1

                avg_train_loss = .0
                avg_acc = 0.
                for ff in range(self.dataset_conf.num_fwd_pass):
                    batch_fwd = []

                    if self.use_gpu:
                        for dd, gpu_id in enumerate(self.gpus):
                            data = {}
                            data['adj'] = batch_data[dd][ff]['adj'].pin_memory(
                            ).to(gpu_id, non_blocking=True)
                            data['edges'] = batch_data[dd][ff][
                                'edges'].pin_memory().to(gpu_id,
                                                         non_blocking=True)
                            # data['node_idx_gnn'] = batch_data[dd][ff]['node_idx_gnn'].pin_memory().to(gpu_id, non_blocking=True)
                            # data['node_idx_feat'] = batch_data[dd][ff]['node_idx_feat'].pin_memory().to(gpu_id, non_blocking=True)
                            # data['label'] = batch_data[dd][ff]['label'].pin_memory().to(gpu_id, non_blocking=True)
                            # data['att_idx'] = batch_data[dd][ff]['att_idx'].pin_memory().to(gpu_id, non_blocking=True)
                            data['subgraph_idx'] = batch_data[dd][ff][
                                'subgraph_idx'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            data['complete_graph_label'] = batch_data[dd][ff][
                                'complete_graph_label'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            batch_fwd.append((data, ))

                    pred = model(*batch_fwd)
                    label = data['complete_graph_label'][:, None]
                    train_loss = criterion(pred, label).mean()
                    train_loss.backward()

                    pred = (torch.sigmoid(pred) > 0.5).type_as(label)
                    avg_acc += (pred.eq(label)).float().mean().item()

                    avg_train_loss += train_loss.item()

                    # assign gradient

                # clip_grad_norm_(model.parameters(), 5.0e-0)
                optimizer.step()
                lr_scheduler.step()
                avg_train_loss /= self.dataset_conf.num_fwd_pass  # num_fwd_pass always 1
                avg_acc /= self.dataset_conf.num_fwd_pass

                # weight each batch's mean accuracy by its size so the division
                # by cnt below yields a per-sample accuracy for the epoch
                avg_acc_whole_epoch += avg_acc * len(data['complete_graph_label'])
                cnt += len(data['complete_graph_label'])

                # reduce
                self.writer.add_scalar('train_loss', avg_train_loss,
                                       iter_count)
                self.writer.add_scalar('train_acc', avg_acc, iter_count)
                results['train_loss'] += [avg_train_loss]
                results['train_acc'] += [avg_acc]
                results['train_step'] += [iter_count]

                # if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                #   logger.info("NLL Loss @ epoch {:04d} iteration {:08d} = {}\tAcc = {}".format(epoch + 1, iter_count, train_loss, avg_acc))

            avg_acc_whole_epoch /= cnt
            is_new_best = avg_acc_whole_epoch > best_acc
            if is_new_best:
                logger.info('!!! New best')
                best_acc = avg_acc_whole_epoch
            logger.info("Avg acc = {} @ epoch {:04d}".format(
                avg_acc_whole_epoch, epoch + 1))

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0 or is_new_best:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model,
                         optimizer,
                         self.config,
                         epoch + 1,
                         scheduler=lr_scheduler)

        pickle.dump(
            results,
            open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
        self.writer.close()

        return 1
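
The third variant trains a binary graph classifier: BCEWithLogitsLoss is applied to raw logits, and accuracy is computed by thresholding sigmoid(pred) at 0.5. A self-contained sketch of that loss/accuracy pairing, using toy logits and labels rather than the actual model outputs:

import torch
import torch.nn as nn

criterion = nn.BCEWithLogitsLoss()            # applies the sigmoid internally
# toy logits and labels standing in for the model output and 'complete_graph_label'
logits = torch.randn(8, 1)
labels = torch.randint(0, 2, (8, 1)).float()

loss = criterion(logits, labels)
pred = (torch.sigmoid(logits) > 0.5).type_as(labels)
acc = pred.eq(labels).float().mean().item()
print(loss.item(), acc)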
Example #4
    def train(self):
        ### create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            self.graphs_train,
                                                            tag='train')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)

        # create models
        # model = eval(self.model_conf.name)(self.config)
        from model.transformer import make_model
        model = make_model(max_node=self.config.model.max_num_nodes,
                           d_out=20,
                           N=7,
                           d_model=64,
                           d_ff=64,
                           dropout=0.4)  # d_out, N, d_model, d_ff, h
        # d_out=20, N=15, d_model=16, d_ff=16, dropout=0.2) # d_out, N, d_model, d_ff, h
        # d_out=20, N=3, d_model=64, d_ff=64, dropout=0.1) # d_out, N, d_model, d_ff, h

        if self.use_gpu:
            model = DataParallel(model, device_ids=self.gpus).to(self.device)

        # create optimizer
        params = filter(lambda p: p.requires_grad, model.parameters())
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_epoch,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        # resume training
        resume_epoch = 0
        if self.train_conf.is_resume:
            model_file = os.path.join(self.train_conf.resume_dir,
                                      self.train_conf.resume_model)
            load_model(model.module if self.use_gpu else model,
                       model_file,
                       self.device,
                       optimizer=optimizer,
                       scheduler=lr_scheduler)
            resume_epoch = self.train_conf.resume_epoch

        # Training Loop
        iter_count = 0
        results = defaultdict(list)
        for epoch in range(resume_epoch, self.train_conf.max_epoch):
            model.train()
            lr_scheduler.step()
            train_iterator = iter(train_loader)

            for inner_iter in range(len(train_loader) // self.num_gpus):
                optimizer.zero_grad()

                batch_data = []
                if self.use_gpu:
                    for _ in self.gpus:
                        data = next(train_iterator)
                        batch_data += [data]

                avg_train_loss = .0
                for ff in range(self.dataset_conf.num_fwd_pass):
                    batch_fwd = []

                    if self.use_gpu:
                        for dd, gpu_id in enumerate(self.gpus):
                            data = batch_data[dd]

                            adj, lens = data['adj'], data['lens']

                            # this is only for grid
                            # adj = adj[:, :, :100, :100]
                            # lens = [min(99, x) for x in lens]

                            adj = adj.to('cuda:%d' % gpu_id)

                            # build masks
                            node_feat, attn_mask, lens = preprocess(adj, lens)
                            batch_fwd.append(
                                (node_feat, attn_mask.clone(), lens))

                    if batch_fwd:
                        node_feat, attn_mask, lens = batch_fwd[0]
                        log_theta, log_alpha = model(*batch_fwd)

                        train_loss = model.module.mix_bern_loss(
                            log_theta, log_alpha, adj, lens)

                        avg_train_loss += train_loss

                        # assign gradient
                        train_loss.backward()

                # clip_grad_norm_(model.parameters(), 5.0e-0)
                optimizer.step()
                avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

                # reduce
                train_loss = float(avg_train_loss.data.cpu().numpy())

                self.writer.add_scalar('train_loss', train_loss, iter_count)
                results['train_loss'] += [train_loss]
                results['train_step'] += [iter_count]

                if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                    logger.info(
                        "NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(
                            epoch + 1, iter_count, train_loss))

                if epoch % 50 == 0 and inner_iter == 0:
                    model.eval()
                    print('saving graphs')
                    graphs_gen = [get_graph(adj[0].cpu().data.numpy())] + [
                        get_graph(aa.cpu().data.numpy())
                        for aa in model.module.sample(
                            19, max_node=self.config.model.max_num_nodes)
                    ]
                    model.train()

                    vis_graphs = []
                    for gg in graphs_gen:
                        CGs = [
                            gg.subgraph(c) for c in nx.connected_components(gg)
                        ]
                        CGs = sorted(CGs,
                                     key=lambda x: x.number_of_nodes(),
                                     reverse=True)
                        try:
                            vis_graphs += [CGs[0]]
                        except:
                            pass

                    try:
                        total = len(vis_graphs)  #min(3, len(vis_graphs))
                        draw_graph_list(vis_graphs[:total],
                                        4,
                                        int(total // 4),
                                        fname='sample/trans_sl:%d_%d.png' %
                                        (int(model.module.self_loop), epoch),
                                        layout='spring')
                    except:
                        print('sample saving failed')

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model,
                         optimizer,
                         self.config,
                         epoch + 1,
                         scheduler=lr_scheduler)

        pickle.dump(
            results,
            open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
        self.writer.close()

        return 1
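
All four variants checkpoint and resume through external snapshot and load_model helpers whose format is not shown here. A minimal sketch of the usual round trip with plain torch.save / torch.load follows; the file name and dictionary keys are assumptions for illustration, not the helpers' actual format.

import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(8, 1)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 200], gamma=0.1)

# save model, optimizer, and scheduler state for a given epoch
torch.save({'epoch': 5,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()}, 'snapshot_0005.pth')

# resume: restore all three states before continuing the training loop
ckpt = torch.load('snapshot_0005.pth')
model.load_state_dict(ckpt['model'])
optimizer.load_state_dict(ckpt['optimizer'])
scheduler.load_state_dict(ckpt['scheduler'])
resume_epoch = ckpt['epoch']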