def split_dataset_general(dataset, args):
    """Randomly split ``dataset`` into train/valid/test DataLoaders by ratio.

    Uses ``args.train_ratio`` and ``args.test_ratio``; whatever remains in the
    middle becomes the validation split.  When the two ratios sum to 1 the
    test loader doubles as the validation loader.

    Args:
        dataset: an indexable collection of graphs.
        args: namespace with ``model``, ``train_ratio``, ``test_ratio`` and
            ``batch_size`` attributes.

    Returns:
        (train_loader, valid_loader, test_loader) tuple of DataLoaders.
    """
    # diffpool requires fixed-size batches, so incomplete batches are dropped
    droplast = args.model == "diffpool"
    train_size = int(len(dataset) * args.train_ratio)
    test_size = int(len(dataset) * args.test_ratio)
    index = list(range(len(dataset)))
    random.shuffle(index)
    train_index = index[:train_size]
    # BUGFIX: use an explicit lower bound. index[-test_size:] returns the
    # ENTIRE list when test_size == 0 (e.g. a tiny dataset or test_ratio=0).
    test_index = index[len(index) - test_size:]
    bs = args.batch_size
    train_loader = DataLoader([dataset[i] for i in train_index], batch_size=bs, drop_last=droplast)
    test_loader = DataLoader([dataset[i] for i in test_index], batch_size=bs, drop_last=droplast)
    if args.train_ratio + args.test_ratio < 1:
        val_index = index[train_size:len(index) - test_size]
        valid_loader = DataLoader([dataset[i] for i in val_index], batch_size=bs, drop_last=droplast)
    else:
        # no samples left over: validate on the test split
        valid_loader = test_loader
    return train_loader, valid_loader, test_loader
def _kfold_train(self):
    """Run stratified k-fold cross-validation over ``self.data``.

    For every fold a fresh model, optimizer and scheduler are created, the
    fold's train/test indices are wrapped in DataLoaders (the test split is
    reused as the validation split), and ``self._train()`` is executed.

    Returns:
        dict with key ``"Acc"`` holding the mean accuracy across folds.
    """
    labels = [graph.y for graph in self.data]
    splitter = StratifiedKFold(n_splits=self.folds, shuffle=True, random_state=self.args.seed)
    drop_last = self.args.model == 'diffpool'
    batch_size = self.args.batch_size
    fold_scores = []
    for train_idx, test_idx in splitter.split(self.data, y=labels):
        self.model = build_model(self.args).to(self.device)
        self.train_loader = DataLoader([self.data[i] for i in train_idx], batch_size=batch_size, drop_last=drop_last)
        self.test_loader = DataLoader([self.data[i] for i in test_idx], batch_size=batch_size, drop_last=drop_last)
        # validation reuses the test indices, in its own loader object
        self.val_loader = DataLoader([self.data[i] for i in test_idx], batch_size=batch_size, drop_last=drop_last)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr, weight_decay=self.args.weight_decay)
        self.scheduler = torch.optim.lr_scheduler.StepLR(optimizer=self.optimizer, step_size=50, gamma=0.5)
        fold_scores.append(self._train()["Acc"])
    return dict(Acc=sum(fold_scores) / len(fold_scores))
def split_dataset(cls, dataset, args):
    """Split ``dataset`` into train/valid/test DataLoaders.

    ModelNet datasets come pre-split ("train"/"test" keys); their point
    positions are used as node features and the test loader doubles as the
    validation loader.  Every other dataset is shuffled in place and split by
    ``args.train_ratio`` / ``args.test_ratio``.

    Returns:
        (train_loader, valid_loader, test_loader) tuple.
    """
    if "ModelNet" in args.dataset:
        # point clouds: use positions as features
        train_data = [Data(x=d.pos, y=d.y) for d in dataset["train"]]
        test_data = [Data(x=d.pos, y=d.y) for d in dataset["test"]]
        train_loader = DataLoader(train_data, batch_size=args.batch_size, num_workers=6)
        test_loader = DataLoader(test_data, batch_size=args.batch_size, num_workers=6, shuffle=False)
        return train_loader, test_loader, test_loader
    else:
        random.shuffle(dataset)  # NOTE: mutates the caller's list in place
        train_size = int(len(dataset) * args.train_ratio)
        test_size = int(len(dataset) * args.test_ratio)
        bs = args.batch_size
        train_loader = DataLoader(dataset[:train_size], batch_size=bs)
        # BUGFIX: explicit lower bound; dataset[-test_size:] returns the
        # WHOLE dataset when test_size == 0.
        test_loader = DataLoader(dataset[len(dataset) - test_size:], batch_size=bs)
        if args.train_ratio + args.test_ratio < 1:
            valid_loader = DataLoader(dataset[train_size:len(dataset) - test_size], batch_size=bs)
        else:
            valid_loader = test_loader
        return train_loader, valid_loader, test_loader
def split_dataset(cls, dataset, args):
    """Hold out a random 10% of ``dataset`` as the test split.

    The test loader is also returned as the validation loader.

    Returns:
        (train_loader, test_loader, test_loader) tuple of DataLoaders.
    """
    test_index = random.sample(range(len(dataset)), len(dataset) // 10)
    # PERF: membership test against a set is O(1); the original list scan
    # made this comprehension O(n^2) for large datasets.
    test_set = set(test_index)
    train_index = [x for x in range(len(dataset)) if x not in test_set]
    train_dataset = [dataset[i] for i in train_index]
    test_dataset = [dataset[i] for i in test_index]
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
    return train_loader, test_loader, test_loader
def distributed_dataloader(self, dataloader: DataLoader, dataset, rank):
    """Rebuild ``dataloader`` with a DistributedSampler for the given rank.

    Reads the loader's original constructor arguments back via its
    ``get_parameters()`` hook, swaps in a ``DistributedSampler`` sized to
    ``self.world_size``, and re-instantiates the same loader class.
    """
    # TODO: just a toy implementation
    assert isinstance(dataloader, DataLoader)
    init_args, init_kwargs = dataloader.get_parameters()
    init_kwargs["sampler"] = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=self.world_size, rank=rank
    )
    return dataloader.__class__(*init_args, **init_kwargs)
def split_dataset(cls, dataset, args):
    """Shuffle ``dataset`` in place and split it by ratio into DataLoaders.

    ``args.train_ratio`` and ``args.test_ratio`` select the outer slices; the
    middle remainder (if any) becomes the validation split, otherwise the
    test loader is reused for validation.

    Returns:
        (train_loader, valid_loader, test_loader) tuple.
    """
    random.shuffle(dataset)  # NOTE: mutates the caller's list in place
    train_size = int(len(dataset) * args.train_ratio)
    test_size = int(len(dataset) * args.test_ratio)
    bs = args.batch_size
    train_loader = DataLoader(dataset[:train_size], batch_size=bs)
    # BUGFIX: explicit lower bound; dataset[-test_size:] returns the WHOLE
    # dataset when test_size == 0.
    test_loader = DataLoader(dataset[len(dataset) - test_size:], batch_size=bs)
    if args.train_ratio + args.test_ratio < 1:
        valid_loader = DataLoader(dataset[train_size:len(dataset) - test_size], batch_size=bs)
    else:
        valid_loader = test_loader
    return train_loader, valid_loader, test_loader
def get_loader(self, args):
    """Build train/valid/test DataLoaders from the dataset's canonical split.

    Only the training loader shuffles; validation and test keep their order.

    Returns:
        (train_loader, valid_loader, test_loader) tuple.
    """
    split_index = self.dataset.get_idx_split()
    bs = args.batch_size
    loaders = {
        name: DataLoader(self.get_subset(split_index[name]), batch_size=bs, shuffle=(name == "train"))
        for name in ("train", "valid", "test")
    }
    return loaders["train"], loaders["valid"], loaders["test"]
def __init__(self, args, dataset=None, model=None):
    """Set up a supervised graph-classification task.

    Builds (or accepts) the dataset and model, derives dataset-dependent
    fields on ``args``, constructs train/val/test loaders, and creates the
    optimizer and LR scheduler.

    Args:
        args: task configuration namespace; mutated in place
            (``max_graph_size``, ``num_features``, ``num_classes``,
            ``use_unsup``).
        dataset: optional pre-built dataset; built from ``args`` when None.
        model: optional pre-built model; built from ``args`` when None.
    """
    super(GraphClassification, self).__init__(args)
    dataset = build_dataset(args) if dataset is None else dataset
    # largest graph in the dataset (needed by pooling models)
    args.max_graph_size = max([ds.num_nodes for ds in dataset])
    args.num_features = dataset.num_features
    args.num_classes = dataset.num_classes
    args.use_unsup = False
    self.args = args
    self.kfold = args.kfold
    self.folds = 10
    # fall back to CPU when CUDA is unavailable or explicitly disabled
    self.device = "cpu" if not torch.cuda.is_available() or args.cpu else args.device_id[0]
    if args.dataset.startswith("ogbg"):
        # OGB graph datasets ship their own canonical split
        self.data = dataset.data
        self.train_loader, self.val_loader, self.test_loader = dataset.get_loader(args)
        model = build_model(args) if model is None else model
    else:
        self.data = dataset
        if self.data[0].x is None:
            # featureless graphs: use node degree as the input feature
            self.data = node_degree_as_feature(dataset)
            args.num_features = self.data.num_features
        model = build_model(args) if model is None else model
        (
            self.train_dataset,
            self.val_dataset,
            self.test_dataset,
        ) = model.split_dataset(self.data, args)
        # NOTE(review): split_dataset here appears to return kwargs dicts for
        # DataLoader, while other split_dataset variants in this file return
        # loaders directly — confirm against the model implementations.
        self.train_loader = DataLoader(**self.train_dataset)
        self.val_loader = DataLoader(**self.val_dataset)
        self.test_loader = DataLoader(**self.test_dataset)
    self.model = model.to(self.device)
    self.set_loss_fn(dataset)
    self.set_evaluator(dataset)
    self.patience = args.patience
    self.max_epoch = args.max_epoch
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # halve the learning rate every 50 epochs
    self.scheduler = torch.optim.lr_scheduler.StepLR(optimizer=self.optimizer, step_size=50, gamma=0.5)
def split_dataset(cls, dataset, args):
    """Split ``dataset`` into train/valid/test DataLoaders.

    QM9 uses the conventional fixed split: first 10k graphs for test, next
    10k for validation, and ``args.train_num`` graphs after that for
    training.  All other datasets hold out a random 10% as test (which also
    serves as the validation split).

    Returns:
        (train_loader, valid_loader, test_loader) tuple.
    """
    if args.dataset == "QM9":
        test_dataset = dataset[:10000]
        val_dataset = dataset[10000:20000]
        train_dataset = dataset[20000:20000 + args.train_num]
        return (
            DataLoader(train_dataset, batch_size=args.batch_size),
            DataLoader(val_dataset, batch_size=args.batch_size),
            DataLoader(test_dataset, batch_size=args.batch_size),
        )
    else:
        test_index = random.sample(range(len(dataset)), len(dataset) // 10)
        # PERF: set membership is O(1); the original list scan made this
        # comprehension O(n^2) for large datasets.
        test_set = set(test_index)
        train_index = [x for x in range(len(dataset)) if x not in test_set]
        train_dataset = [dataset[i] for i in train_index]
        test_dataset = [dataset[i] for i in test_index]
        train_loader = DataLoader(train_dataset, batch_size=args.batch_size)
        test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
        return train_loader, test_loader, test_loader
def __init__(self, args):
    """Set up the InfoMax (Deep Graph Infomax-style) pre-training trainer.

    Builds a shuffled DataLoader over ``self.dataset`` (created by the base
    class), the GNN encoder, a discriminator over its hidden representation,
    and a BCE-with-logits loss plus Adam optimizer.

    Args:
        args: trainer configuration; ``args.data_type`` is forced to
            "unsupervised" before the base-class initializer runs.
    """
    args.data_type = "unsupervised"
    super(InfoMaxTrainer, self).__init__(args)
    self.hidden_size = args.hidden_size
    # self.dataset is expected to be populated by the base-class __init__
    self.dataloader = DataLoader(self.dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)
    # NOTE(review): self.opt looks like a per-model option dict supplied by
    # the base class; optional encoder features default to None — confirm.
    self.model = GNN(
        num_layers=args.num_layers,
        hidden_size=args.hidden_size,
        JK=args.JK,
        dropout=args.dropout,
        input_layer=self.opt.get("input_layer", None),
        edge_encode=self.opt.get("edge_encode", None),
        edge_emb=self.opt.get("edge_emb", None),
        num_atom_type=self.opt.get("num_atom_type", None),
        num_chirality_tag=self.opt.get("num_chirality_tag", None),
        concat=self.opt["concat"],
    )
    self.discriminator = Discriminator(args.hidden_size)
    # binary real-vs-corrupted objective
    self.loss_fn = nn.BCEWithLogitsLoss()
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
def split_data(self, train_ratio=0.6):
    """Shuffle the dataset and return a DataLoader over a random train split.

    Generalized: the previously hard-coded 0.6 train fraction is now a
    keyword parameter with the same default, so existing callers are
    unaffected.  The chosen ratio is also stored on ``self.train_ratio``.

    Args:
        train_ratio: fraction of the dataset used for training (default 0.6).

    Returns:
        A shuffling DataLoader over the selected training subset.
    """
    length = len(self.dataset)
    indices = np.arange(length)
    np.random.shuffle(indices)
    self.train_ratio = train_ratio
    # first train_ratio fraction of the shuffled order
    train_index = torch.LongTensor(indices[: int(length * self.train_ratio)])
    dataset = self.dataset[train_index]
    dataloader = DataLoader(dataset=dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)
    return dataloader
def split_dataset(self, dataset, args):
    """Attach a precomputed patch feature ``tx`` to every graph, then split
    the shuffled dataset into train/valid/test DataLoaders by ratio.

    Returns:
        (train_loader, valid_loader, test_loader) tuple.
    """
    random.shuffle(dataset)  # NOTE: mutates the caller's list in place
    # process each graph and add it into Data() as attribute tx
    for i in range(len(dataset)):
        new_feature = get_single_feature(dataset[i], args.num_features, args.num_classes, args.sample, args.neighbor, args.stride)
        dataset[i].tx = torch.from_numpy(new_feature)
    train_size = int(len(dataset) * args.train_ratio)
    test_size = int(len(dataset) * args.test_ratio)
    bs = args.batch_size
    train_loader = DataLoader(dataset[:train_size], batch_size=bs)
    # BUGFIX: explicit lower bound; dataset[-test_size:] returns the WHOLE
    # dataset when test_size == 0.
    test_loader = DataLoader(dataset[len(dataset) - test_size:], batch_size=bs)
    if args.train_ratio + args.test_ratio < 1:
        valid_loader = DataLoader(dataset[train_size:len(dataset) - test_size], batch_size=bs)
    else:
        valid_loader = test_loader
    return train_loader, valid_loader, test_loader
def test_step(self, dataset):
    """Embed every graph with the frozen model and score the embeddings
    with an SVM evaluation, recording accuracy and its std via self.note.
    """
    loader = DataLoader(dataset, batch_size=32, shuffle=False)
    embeddings = []
    with torch.no_grad():
        for batch in loader:
            embeddings.append(self.model(batch.to(self.device)))
    embeddings = torch.cat(embeddings).cpu().numpy()
    labels = np.array([graph.y.item() for graph in dataset])
    scores = evaluate_graph_embeddings_using_svm(embeddings, labels)
    self.note("test_metric", scores["acc"])
    self.note("std", scores["std"])
def __init__(self, args, dataset=None, model=None):
    """Set up an unsupervised graph-classification (graph embedding) task.

    Collects graphs and labels, moves graph tensors to the chosen device,
    derives dataset-dependent fields on ``args``, and — for neural models
    such as infograph — also builds an optimizer and a shuffled DataLoader.

    Args:
        args: task configuration namespace; mutated in place
            (``num_features``, ``num_classes``, ``use_unsup``).
        dataset: optional pre-built dataset; built from ``args`` when None.
        model: optional pre-built model; built from ``args`` when None.
    """
    super(UnsupervisedGraphClassification, self).__init__(args)
    # fall back to CPU when CUDA is unavailable or explicitly disabled
    self.device = "cpu" if not torch.cuda.is_available() or args.cpu else args.device_id[0]
    dataset = build_dataset(args) if dataset is None else dataset
    if "gcc" in args.model:
        # GCC-style datasets expose DGL-like graph lists and a label matrix
        self.label = dataset.graph_labels[:, 0]
        self.data = dataset.graph_lists
    else:
        self.label = np.array([data.y for data in dataset])
        # copy each graph into a fresh Data object and move it to the device
        self.data = [
            Data(x=data.x, y=data.y, edge_index=data.edge_index, edge_attr=data.edge_attr, pos=data.pos).apply(lambda x: x.to(self.device))
            for data in dataset
        ]
    args.num_features = dataset.num_features
    # unsupervised: the "class" dimension is the embedding size
    args.num_classes = args.hidden_size
    args.use_unsup = True
    if args.degree_feature:
        # featureless graphs: use node degree as the input feature
        self.data = node_degree_as_feature(self.data)
        args.num_features = self.data[0].num_features
    self.num_graphs = len(self.data)
    self.num_classes = dataset.num_classes
    # self.label_matrix = np.zeros((self.num_graphs, self.num_classes))
    # self.label_matrix[range(self.num_graphs), np.array([data.y for data in self.data], dtype=int)] = 1
    self.model = build_model(args) if model is None else model
    self.model = self.model.to(self.device)
    self.model_name = args.model
    self.hidden_size = args.hidden_size
    self.num_shuffle = args.num_shuffle
    self.save_dir = args.save_dir
    self.epoch = args.epoch
    # only neural models need an optimizer and minibatch loader
    self.use_nn = args.model in ("infograph", )
    if self.use_nn:
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
        self.data_loader = DataLoader(self.data, batch_size=args.batch_size, shuffle=True)
def __init__(self, args):
    """Set up an unsupervised graph-classification task (CUDA-only variant).

    Collects graphs and labels, moves graph tensors to the GPU, derives
    dataset-dependent fields on ``args``, and — when ``args.nn`` is set —
    also builds an optimizer and a shuffled DataLoader.

    Args:
        args: task configuration namespace; mutated in place
            (``num_features``, ``num_classes``, ``use_unsup``).
    """
    super(UnsupervisedGraphClassification, self).__init__(args)
    dataset = build_dataset(args)
    self.label = np.array([data.y for data in dataset])
    # copy each graph into a fresh Data object and move it to the GPU
    # NOTE(review): unconditionally calls .cuda() — fails on CPU-only hosts.
    self.data = [
        Data(x=data.x, y=data.y, edge_index=data.edge_index, edge_attr=data.edge_attr, pos=data.pos).apply(lambda x: x.cuda())
        for data in dataset
    ]
    args.num_features = dataset.num_features
    # unsupervised: the "class" dimension is the embedding size
    args.num_classes = args.hidden_size
    args.use_unsup = True
    if args.degree_feature:
        # featureless graphs: use node degree as the input feature
        self.data = node_degree_as_feature(self.data)
        args.num_features = self.data[0].num_features
    self.num_graphs = len(self.data)
    self.num_classes = dataset.num_classes
    # self.label_matrix = np.zeros((self.num_graphs, self.num_classes))
    # self.label_matrix[range(self.num_graphs), np.array([data.y for data in self.data], dtype=int)] = 1
    self.model = build_model(args)
    self.model = self.model.cuda()
    self.model_name = args.model
    self.hidden_size = args.hidden_size
    self.num_shuffle = args.num_shuffle
    self.save_dir = args.save_dir
    self.epochs = args.epochs
    # only neural models need an optimizer and minibatch loader
    self.use_nn = args.nn
    if args.nn:
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
        self.data_loader = DataLoader(self.data, batch_size=args.batch_size, shuffle=True)
def split_dataset(cls, dataset, args):
    """Split ``dataset`` into train/valid/test DataLoaders.

    "qm9" uses the conventional fixed split (first 10k test, next 10k valid,
    ``args.train_num`` graphs for training).  Other datasets are shuffled in
    place and split by ``args.train_ratio`` / ``args.test_ratio``.

    Returns:
        (train_loader, valid_loader, test_loader) tuple.
    """
    if args.dataset == "qm9":
        test_dataset = dataset[:10000]
        val_dataset = dataset[10000:20000]
        train_dataset = dataset[20000:20000 + args.train_num]
        return (
            DataLoader(train_dataset, batch_size=args.batch_size),
            DataLoader(val_dataset, batch_size=args.batch_size),
            DataLoader(test_dataset, batch_size=args.batch_size),
        )
    else:
        random.shuffle(dataset)  # NOTE: mutates the caller's list in place
        train_size = int(len(dataset) * args.train_ratio)
        test_size = int(len(dataset) * args.test_ratio)
        bs = args.batch_size
        train_loader = DataLoader(dataset[:train_size], batch_size=bs)
        # BUGFIX: explicit lower bound; dataset[-test_size:] returns the
        # WHOLE dataset when test_size == 0.
        test_loader = DataLoader(dataset[len(dataset) - test_size:], batch_size=bs)
        if args.train_ratio + args.test_ratio < 1:
            valid_loader = DataLoader(dataset[train_size:len(dataset) - test_size], batch_size=bs)
        else:
            valid_loader = test_loader
        return train_loader, valid_loader, test_loader
def test_wrapper(self): return DataLoader(self.dataset[self.split_idx[2]], batch_size=self.batch_size, shuffle=False, num_workers=4)
def val_wrapper(self):
    """Return a non-shuffling DataLoader over the validation split
    (split_idx[1]), or None when no validation split exists.
    """
    val_idx = self.split_idx[1]
    if val_idx is None:
        # no validation split configured — mirror the original implicit None
        return None
    return DataLoader(
        self.dataset[val_idx],
        batch_size=self.batch_size,
        shuffle=False,
        num_workers=4,
    )
def train_wrapper(self):
    """Return a shuffling DataLoader over the training split (split_idx[0])."""
    train_split = self.dataset[self.split_idx[0]]
    return DataLoader(
        train_split,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=4,
    )