def __init__(self, params):
    """Prepare device, test data and a pretrained model for prediction.

    Args:
        params: Run configuration. This constructor reads ``gpu``
            (``-1`` selects the CPU), ``evaluate``, ``dense_dim``,
            ``hidden_dim`` and ``dropout`` from it.

    ``load_data(params)`` is expected to return seven values when
    ``params.evaluate`` is truthy (the extra one is stored as
    ``self.map_dict``) and six values otherwise.
    """
    self.params = params

    # Timestamp suffix: <day>_<month>_<year>_<hour>:<minute>:<second>
    date_part = time.strftime('%d_%m_%Y')
    clock_part = time.strftime('%H:%M:%S')
    self.postfix = date_part + '_' + clock_part

    self.prj_path = Path(__file__).parent.resolve()

    if self.params.gpu == -1:
        device_name = 'cpu'
    else:
        device_name = f'cuda:{params.gpu}'
    self.device = torch.device(device_name)

    if self.params.evaluate:
        loaded = load_data(params)
        (self.total_cell, self.num_genes, self.num_classes, self.id2label,
         self.test_dict, self.map_dict, self.time) = loaded
    else:
        loaded = load_data(params)
        (self.total_cell, self.num_genes, self.num_classes, self.id2label,
         self.test_dict, self.time) = loaded

    # Layout of test_dict as noted by the original author:
    #   test_dict = {
    #       'graph': test_graph_dict,
    #       'nid': test_index_dict,
    #       'mask': test_mask_dict
    #   }

    model_kwargs = dict(
        in_feats=params.dense_dim,
        n_hidden=params.hidden_dim,
        n_classes=self.num_classes,
        n_layers=1,
        gene_num=self.num_genes,
        activation=F.relu,
        dropout=params.dropout,
    )
    self.model = GNN(**model_kwargs)

    # Weights are loaded before the model is moved to the target device.
    self.load_model()
    self.num_neighbors = self.total_cell + self.num_genes
    self.model.to(self.device)
def __init__(self, params):
    """Build everything needed to train: data, model, optimizer and loss.

    Args:
        params: Run configuration. This constructor reads ``species``,
            ``gpu`` (``-1`` selects the CPU), ``dense_dim``,
            ``hidden_dim``, ``n_layers``, ``dropout``, ``lr``,
            ``weight_decay`` and ``num_neighbors`` from it.

    Side effects:
        Creates ``<project>/pretrained/<species>/models`` if it does not
        exist and prints the train/test set sizes.
    """
    self.params = params
    self.prj_path = Path(__file__).parent.resolve()

    models_dir = self.prj_path.joinpath('pretrained',
                                        f'{self.params.species}', 'models')
    self.save_path = models_dir
    if not self.save_path.exists():
        self.save_path.mkdir(parents=True)

    use_cpu = self.params.gpu == -1
    self.device = torch.device('cpu' if use_cpu else f'cuda:{params.gpu}')

    (self.num_cells, self.num_genes, self.num_labels, self.graph,
     self.train_ids, self.test_ids,
     self.labels) = load_data_internal(params)
    self.labels = self.labels.to(self.device)

    network = GNN(
        in_feats=self.params.dense_dim,
        n_hidden=self.params.hidden_dim,
        n_classes=self.num_labels,
        n_layers=self.params.n_layers,
        gene_num=self.num_genes,
        activation=F.relu,
        dropout=self.params.dropout,
    )
    self.model = network.to(self.device)

    self.optimizer = torch.optim.Adam(
        self.model.parameters(),
        lr=self.params.lr,
        weight_decay=self.params.weight_decay,
    )
    # Summed (not averaged) loss over each batch.
    self.loss_fn = nn.CrossEntropyLoss(reduction='sum')

    # num_neighbors == 0 means "use every cell and gene node".
    self.num_neighbors = (self.num_cells + self.num_genes
                          if self.params.num_neighbors == 0 else
                          self.params.num_neighbors)

    print(
        f"Train Number: {len(self.train_ids)}, Test Number: {len(self.test_ids)}"
    )
def main():
    """Train the GNN on the TFRecord training set and export the model.

    Restores the latest checkpoint from ``checkpoints`` when one exists, then
    iterates the endlessly repeated training set one sample per step. The
    loss is the sum of a node-classification loss and an edge loss that
    compares the predicted adjacency channels with successive powers of the
    row-normalised adjacency matrix. The running mean loss is logged every
    100 optimizer steps and training stops once it falls below 0.01; a
    checkpoint is written every 2000 steps. After training, the model is
    saved to ``model/gnn.h5``.

    ``GNN``, ``layers``, ``class_num`` and ``parse_function`` are taken from
    module scope.
    """
    gnn = GNN(7, 96, layers, class_num)
    optimizer = tf.keras.optimizers.Adam(1e-3)
    trainset = tf.data.TFRecordDataset(
        join('datasets', 'trainset.tfrecord')).repeat(-1).map(
            parse_function).batch(1).prefetch(
                tf.data.experimental.AUTOTUNE)
    if not exists('checkpoints'):
        mkdir('checkpoints')
    checkpoint = tf.train.Checkpoint(model=gnn, optimizer=optimizer)
    checkpoint.restore(tf.train.latest_checkpoint('checkpoints'))
    log = tf.summary.create_file_writer('checkpoints')
    avg_loss = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)
    for embeddings, _1_jump_adj, region_types in trainset:
        # embeddings.shape = (1, N, 7), feature vectors of nodes
        # _1_jump_adj.shape = (1, N, N), adjacent matrix
        # region_types.shape = (1, N), class of nodes
        row_sum = tf.math.reduce_sum(_1_jump_adj, axis=-1, keepdims=True)
        # NOTE(review): a node without any edge gives row_sum == 0 here and
        # therefore a division by zero -- confirm the dataset rules it out.
        _1_jump_adj = _1_jump_adj / row_sum
        with tf.GradientTape() as tape:
            # features.shape = (1, N, class_num),
            # adjacent.shape = (1, N, N, jumps = 16)
            features, adjacent = gnn(embeddings)
            class_loss = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True)(region_types, features)

            def body(i, n_jump_adj, loss):
                # Accumulate the MSE between the current adjacency power and
                # the i-th predicted adjacency channel, then advance both.
                loss += tf.keras.losses.MSE(
                    tf.keras.layers.Flatten()(n_jump_adj),
                    tf.keras.layers.Flatten()(adjacent[:, :, :, i]))
                i += 1
                # n_jump_adj.shape = (1, N, N)
                n_jump_adj = tf.linalg.matmul(_1_jump_adj, n_jump_adj)
                return i, n_jump_adj, loss

            # The loop starts at i = 1, so channel 0 of `adjacent` is never
            # compared. NOTE(review): confirm this is intended.
            _, _, edge_loss = tf.while_loop(
                lambda i, n_jump_adj, loss: i < adjacent.shape[-1],
                body,
                loop_vars=(1, _1_jump_adj, 0))
            loss = class_loss + edge_loss
        avg_loss.update_state(loss)
        if tf.equal(optimizer.iterations % 100, 0):
            with log.as_default():
                tf.summary.scalar('loss',
                                  avg_loss.result(),
                                  step=optimizer.iterations)
            print('Step #%d Loss: %.6f' %
                  (optimizer.iterations, avg_loss.result()))
            if avg_loss.result() < 0.01:
                break
            avg_loss.reset_states()
        grads = tape.gradient(loss, gnn.trainable_variables)
        optimizer.apply_gradients(zip(grads, gnn.trainable_variables))
        if tf.equal(optimizer.iterations % 2000, 0):
            checkpoint.save(join('checkpoints', 'ckpt'))
    # Fixed: the original tested `Fasle == exists('model')`, which raised a
    # NameError once training finished, so the model was never exported.
    if not exists('model'):
        mkdir('model')
    gnn.save(join('model', 'gnn.h5'))
class Trainer:
    """Train a GNN node classifier with neighbor sampling.

    After every epoch the model is evaluated on the train and test ids, and
    the checkpoint with the best test accuracy so far is written to
    ``<project>/pretrained/<species>/models/<species>-<tissue>.pt``.
    """

    def __init__(self, params):
        """Load the data and build the model, optimizer and loss.

        Args:
            params: Run configuration. Fields read by this class: ``species``,
                ``tissue``, ``gpu`` (``-1`` selects the CPU), ``dense_dim``,
                ``hidden_dim``, ``n_layers``, ``dropout``, ``lr``,
                ``weight_decay``, ``num_neighbors``, ``n_epochs``,
                ``batch_size`` and ``unsure_rate``.
        """
        self.params = params
        self.prj_path = Path(__file__).parent.resolve()
        self.save_path = (self.prj_path / 'pretrained' /
                          f'{self.params.species}' / 'models')
        # exist_ok avoids the race between an exists() check and mkdir().
        self.save_path.mkdir(parents=True, exist_ok=True)
        self.device = torch.device('cpu' if self.params.gpu ==
                                   -1 else f'cuda:{params.gpu}')
        (self.num_cells, self.num_genes, self.num_labels, self.graph,
         self.train_ids, self.test_ids,
         self.labels) = load_data_internal(params)
        self.labels = self.labels.to(self.device)
        self.model = GNN(in_feats=self.params.dense_dim,
                         n_hidden=self.params.hidden_dim,
                         n_classes=self.num_labels,
                         n_layers=self.params.n_layers,
                         gene_num=self.num_genes,
                         activation=F.relu,
                         dropout=self.params.dropout).to(self.device)
        self.optimizer = torch.optim.Adam(
            self.model.parameters(),
            lr=self.params.lr,
            weight_decay=self.params.weight_decay)
        # Summed loss; fit() divides by the number of training ids.
        self.loss_fn = nn.CrossEntropyLoss(reduction='sum')
        # num_neighbors == 0 means "use every cell and gene node".
        if self.params.num_neighbors == 0:
            self.num_neighbors = self.num_cells + self.num_genes
        else:
            self.num_neighbors = self.params.num_neighbors
        print(
            f"Train Number: {len(self.train_ids)}, Test Number: {len(self.test_ids)}"
        )

    @staticmethod
    def _ratio(count, total):
        """Return ``count / total``, or 0.0 when ``total`` is zero."""
        return count / total if total else 0.0

    def fit(self):
        """Run the training loop and report the best test result.

        Trains for at most ``params.n_epochs`` epochs and stops early once
        the training accuracy reaches 1. The model is saved whenever the test
        accuracy is at least as good as the best seen so far, so ties favour
        the later epoch.
        """
        max_test_acc, _train_acc, _epoch = 0, 0, 0
        # Initialised up front so the summary below also works when no epoch
        # runs (n_epochs == 0); the original raised UnboundLocalError there.
        final_test_correct_num, final_test_unsure_num = 0, 0
        num_train, num_test = len(self.train_ids), len(self.test_ids)
        for epoch in range(self.params.n_epochs):
            loss = self.train()
            train_correct, train_unsure = self.evaluate(
                self.train_ids, 'train')
            train_acc = self._ratio(train_correct, num_train)
            test_correct, test_unsure = self.evaluate(self.test_ids, 'test')
            test_acc = self._ratio(test_correct, num_test)
            if max_test_acc <= test_acc:
                final_test_correct_num = test_correct
                final_test_unsure_num = test_unsure
                _train_acc = train_acc
                _epoch = epoch
                max_test_acc = test_acc
                self.save_model()
            print(
                f">>>>Epoch {epoch:04d}: Train Acc {train_acc:.4f}, Loss {self._ratio(loss, num_train):.4f}, Test correct {test_correct}, "
                f"Test unsure {test_unsure}, Test Acc {test_acc:.4f}")
            if train_acc == 1:
                break
        print(
            f"---{self.params.species} {self.params.tissue} Best test result:---"
        )
        print(
            f"Epoch {_epoch:04d}, Train Acc {_train_acc:.4f}, Test Correct Num {final_test_correct_num}, Test Total Num {num_test}, Test Unsure Num {final_test_unsure_num}, Test Acc {self._ratio(final_test_correct_num, num_test):.4f}"
        )

    def train(self):
        """Run one training epoch and return the summed loss over all batches."""
        self.model.train()
        total_loss = 0
        sampler = NeighborSampler(g=self.graph,
                                  batch_size=self.params.batch_size,
                                  expand_factor=self.num_neighbors,
                                  num_hops=self.params.n_layers,
                                  neighbor_type='in',
                                  shuffle=True,
                                  num_workers=8,
                                  seed_nodes=self.train_ids)
        for nf in sampler:
            # Copy node/edge features from the parent graph.
            nf.copy_from_parent()
            logits = self.model(nf)
            batch_nids = nf.layer_parent_nid(-1).type(
                torch.long).to(device=self.device)
            loss = self.loss_fn(logits, self.labels[batch_nids])
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            total_loss += loss.item()
        return total_loss

    def evaluate(self, ids, type='test'):
        """Count correct and "unsure" predictions for the given node ids.

        A prediction is "unsure" when its highest softmax probability is
        below ``params.unsure_rate / num_labels``; unsure predictions are
        never counted as correct.

        Args:
            ids: Seed node ids to evaluate.
            type: Unused; kept for backward compatibility with callers.

        Returns:
            Tuple ``(total_correct, total_unsure)``.
        """
        self.model.eval()
        total_correct, total_unsure = 0, 0
        # Fixed: num_hops used the bare global `params`, which raised a
        # NameError unless the module was run as a script defining it.
        sampler = NeighborSampler(g=self.graph,
                                  batch_size=self.params.batch_size,
                                  expand_factor=self.num_cells +
                                  self.num_genes,
                                  num_hops=self.params.n_layers,
                                  neighbor_type='in',
                                  shuffle=True,
                                  num_workers=8,
                                  seed_nodes=ids)
        unsure_threshold = None
        for nf in sampler:
            # Copy node/edge features from the parent graph.
            nf.copy_from_parent()
            with torch.no_grad():
                logits = self.model(nf).cpu()
            batch_nids = nf.layer_parent_nid(-1).type(torch.long)
            logits = nn.functional.softmax(logits, dim=1).numpy()
            label_list = self.labels.cpu()[batch_nids]
            if unsure_threshold is None:
                unsure_threshold = self.params.unsure_rate / self.num_labels
            for pred, label in zip(logits, label_list):
                max_prob = pred.max().item()
                if max_prob < unsure_threshold:
                    total_unsure += 1
                elif pred.argmax().item() == label:
                    total_correct += 1
        return total_correct, total_unsure

    def save_model(self):
        """Write the model and optimizer state dicts to ``self.save_path``."""
        state = {
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }
        torch.save(
            state, self.save_path /
            f"{self.params.species}-{self.params.tissue}.pt")
# Yields indices to split data into training, validation and test sets idx = np.random.permutation(n) idx_train = idx[:int(0.6 * n)] idx_val = idx[int(0.6 * n):int(0.8 * n)] idx_test = idx[int(0.8 * n):] # Transform the numpy matrices/vectors to torch tensors features = torch.FloatTensor(features) y = torch.LongTensor(np.argmax(class_labels, axis=1)) adj = torch.FloatTensor(adj) idx_train = torch.LongTensor(idx_train) idx_val = torch.LongTensor(idx_val) idx_test = torch.LongTensor(idx_test) # Creates the model and specifies the optimizer model = GNN(features.shape[1], n_hidden_1, n_hidden_2, n_class, dropout_rate) optimizer = optim.Adam(model.parameters(), lr=learning_rate) def train(epoch): t = time.time() model.train() optimizer.zero_grad() output, _ = model(features, adj) loss_train = F.nll_loss(output[idx_train], y[idx_train]) acc_train = accuracy(output[idx_train], y[idx_train]) loss_train.backward() optimizer.step() model.eval() output, _ = model(features, adj)
# Hyper-parameters (the earlier ones are defined above the visible source)
n_hidden_2 = 32
n_hidden_3 = 32
learning_rate = 0.01

# Generates synthetic dataset
Gs, y = create_dataset()
# Number of distinct values in y
n_class = np.unique(y).size

# Splits the dataset into a training and a test set (10% held out for test)
G_train, G_test, y_train, y_test = train_test_split(Gs, y, test_size=0.1)
N_train = len(G_train)
N_test = len(G_test)

# Initializes model and optimizer
model = GNN(1, n_hidden_1, n_hidden_2, n_hidden_3, n_class, device).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()

# Trains the model
for epoch in range(epochs):
    t = time.time()
    model.train()
    # Per-epoch accumulators
    train_loss = 0
    correct = 0
    count = 0
    # Walks over the training set in steps of batch_size
    for i in range(0, N_train, batch_size):
        # Per-batch containers; NOTE(review): the code that fills them lies
        # outside the visible source.
        adj_batch = list()
        idx_batch = list()
        y_batch = list()
class Runner:
    """Run a pretrained GNN on test datasets and save the predictions.

    For each dataset id in ``params.test_dataset`` the runner either
    evaluates against known labels (``params.evaluate`` truthy) or only
    predicts, then writes the predicted cell types to a CSV file.
    """

    def __init__(self, params):
        """Load the test data and the pretrained model.

        Args:
            params: Run configuration. Fields read by this class: ``gpu``
                (``-1`` selects the CPU), ``evaluate``, ``dense_dim``,
                ``hidden_dim``, ``dropout``, ``species``, ``tissue``,
                ``test_dataset``, ``batch_size``, ``unsure_rate`` and
                ``save_dir``.

        ``load_data(params)`` is expected to return seven values when
        ``params.evaluate`` is truthy (the extra one is stored as
        ``self.map_dict``) and six values otherwise.
        """
        self.params = params
        # One clock read, so date and time cannot straddle midnight.
        self.postfix = time.strftime('%d_%m_%Y_%H:%M:%S')
        self.prj_path = Path(__file__).parent.resolve()
        self.device = torch.device('cpu' if self.params.gpu ==
                                   -1 else f'cuda:{params.gpu}')
        if self.params.evaluate:
            (self.total_cell, self.num_genes, self.num_classes,
             self.id2label, self.test_dict, self.map_dict,
             self.time) = load_data(params)
        else:
            (self.total_cell, self.num_genes, self.num_classes,
             self.id2label, self.test_dict, self.time) = load_data(params)
        # Layout of test_dict as noted by the original author:
        #   test_dict = {
        #       'graph': test_graph_dict,
        #       'nid': test_index_dict,
        #       'mask': test_mask_dict
        #   }
        self.model = GNN(in_feats=params.dense_dim,
                         n_hidden=params.hidden_dim,
                         n_classes=self.num_classes,
                         n_layers=1,
                         gene_num=self.num_genes,
                         activation=F.relu,
                         dropout=params.dropout)
        self.load_model()
        self.num_neighbors = self.total_cell + self.num_genes
        self.model.to(self.device)

    def run(self):
        """Predict (and optionally evaluate) every dataset, then save results."""
        for num in self.params.test_dataset:
            tic = time.time()
            if self.params.evaluate:
                correct, total, unsure, acc, pred = self.evaluate_test(num)
                print(
                    f"{self.params.species}_{self.params.tissue} #{num} Test Acc: {acc:.4f} ({correct}/{total}), Number of Unsure Cells: {unsure}"
                )
            else:
                pred = self.inference(num)
            toc = time.time()
            # self.time (returned by load_data) is added to the elapsed time.
            print(
                f'{self.params.species}_{self.params.tissue} #{num} Time Consumed: {toc - tic + self.time:.2f} seconds.'
            )
            self.save_pred(num, pred)

    def load_model(self):
        """Load the ``'model'`` state dict from the pretrained checkpoint."""
        model_path = (self.prj_path / 'pretrained' / self.params.species /
                      'models' /
                      f'{self.params.species}-{self.params.tissue}.pt')
        state = torch.load(model_path, map_location=self.device)
        self.model.load_state_dict(state['model'])

    def _predict_probs(self, num):
        """Return softmax class probabilities for the masked nodes of dataset ``num``."""
        self.model.eval()
        graph = self.test_dict['graph'][num]
        new_logits = torch.zeros((graph.number_of_nodes(), self.num_classes))
        sampler = NeighborSampler(g=graph,
                                  batch_size=self.params.batch_size,
                                  expand_factor=self.total_cell +
                                  self.num_genes,
                                  num_hops=1,
                                  neighbor_type='in',
                                  shuffle=False,
                                  num_workers=8,
                                  seed_nodes=self.test_dict['nid'][num])
        for nf in sampler:
            # Copy node/edge features from the parent graph.
            nf.copy_from_parent()
            with torch.no_grad():
                logits = self.model(nf).cpu()
            batch_nids = nf.layer_parent_nid(-1).type(torch.long)
            new_logits[batch_nids] = logits
        # Keep only the rows selected by the dataset's mask.
        new_logits = new_logits[self.test_dict['mask'][num]]
        return nn.functional.softmax(new_logits, dim=1).numpy()

    def inference(self, num):
        """Predict a label for every masked node of dataset ``num``.

        Returns:
            List of predicted labels; ``'unsure'`` replaces the label when
            the highest probability is below
            ``params.unsure_rate / num_classes``.
        """
        predict_label = []
        for pred in self._predict_probs(num):
            pred_label = self.id2label[pred.argmax().item()]
            if pred.max().item(
            ) < self.params.unsure_rate / self.num_classes:
                predict_label.append('unsure')
            else:
                predict_label.append(pred_label)
        return predict_label

    def evaluate_test(self, num):
        """Predict dataset ``num`` and score the result against its labels.

        A confident prediction counts as correct when it is contained in
        ``self.map_dict[num][true_label]``.

        Returns:
            Tuple ``(correct, total, unsure_num, accuracy, predict_label)``.
            ``accuracy`` is 0.0 for an empty test set instead of raising
            ZeroDivisionError.
        """
        probs = self._predict_probs(num)
        total = probs.shape[0]
        unsure_num, correct = 0, 0
        predict_label = []
        for pred, t_label in zip(probs, self.test_dict['label'][num]):
            pred_label = self.id2label[pred.argmax().item()]
            if pred.max().item(
            ) < self.params.unsure_rate / self.num_classes:
                unsure_num += 1
                predict_label.append('unsure')
            else:
                if pred_label in self.map_dict[num][t_label]:
                    correct += 1
                predict_label.append(pred_label)
        accuracy = correct / total if total else 0.0
        return correct, total, unsure_num, accuracy, predict_label

    def save_pred(self, num, pred):
        """Map predictions to the new type names and write them to CSV.

        Labels missing from the mapping sheet (including ``'unsure'``) are
        written unchanged. The output file is
        ``<project>/<save_dir>/<species>_<tissue>_<num>.csv``.

        Args:
            num: Dataset id, used to pick the rows of ``test_dict`` and to
                name the output file.
            pred: Predicted labels, one per cell.
        """
        # NOTE(review): this path is relative to the current working
        # directory, not to self.prj_path -- confirm that is intended.
        label_map = pd.read_excel(
            './map/celltype2subtype.xlsx',
            sheet_name=self.params.species,
            header=0,
            names=['species', 'old_type', 'new_type', 'new_subtype'])
        label_map = label_map.fillna('N/A', inplace=False)
        oldtype2newtype = dict(
            zip(label_map['old_type'], label_map['new_type']))
        oldtype2newsubtype = dict(
            zip(label_map['old_type'], label_map['new_subtype']))

        save_path = self.prj_path / self.params.save_dir
        # parents/exist_ok: nested save_dir values and concurrent runs no
        # longer make mkdir fail.
        save_path.mkdir(parents=True, exist_ok=True)

        columns = {'index': self.test_dict['origin_id'][num]}
        if self.params.evaluate:
            columns['original label'] = self.test_dict['label'][num]
        columns['cell_type'] = [oldtype2newtype.get(p, p) for p in pred]
        columns['cell_subtype'] = [
            oldtype2newsubtype.get(p, p) for p in pred
        ]
        df = pd.DataFrame(columns)
        df.to_csv(save_path / (self.params.species +
                               f"_{self.params.tissue}_{num}.csv"),
                  index=False)
        print(
            f"output has been stored in {self.params.species}_{self.params.tissue}_{num}.csv"
        )
valid_dataloader, optm=optm, learning_rate=learning_rate, patience=5) test_result = TestModel(model, test_dataloader, max_speed) StoreData(model_name, train_result, test_result, directory, model, random_seed, save_model) # GNN-6 importlib.reload(models) from models import GNN importlib.reload(utils) from utils import TrainModel, TestModel model_name = 'GMN6' print(model_name) model = GNN(A, layer=6, gamma=gamma) model, train_result = TrainModel(model, train_dataloader, valid_dataloader, optm=optm, learning_rate=learning_rate, patience=5) test_result = TestModel(model, test_dataloader, max_speed) StoreData(model_name, train_result, test_result, directory, model, random_seed, save_model) # GNN-8 importlib.reload(models) from models import GNN importlib.reload(utils) from utils import TrainModel, TestModel
def main(_run, _config, _log):
    """Train and test the GNN on one or several data splits and log statistics.

    Args:
        _config: Dictionary whose keys and values are the variables set in
            the Sacred ``cfg`` function.
        _run: Sacred run object; its first observer's directory is used as
            the root for the log file and all outputs.
        _log: Logger provided by Sacred. Unused here; a dedicated logger is
            created instead.

    For every split a folder ``<output>/<split index>`` is filled with the
    best/last prediction dicts, loss and accuracy curves, the best state and
    the gradients (all pickled). Test-accuracy statistics over all splits are
    logged at the end.
    """
    # Sacred does not allow changing _config, and keys are added below,
    # so work on a deep copy.
    config = dcopy(_config)
    torch.cuda.set_device(config['gpu_id'])
    save_source(_run)  # stores the source code with the run
    init_seed(config['seed'])

    run_dir = _run.observers[0].dir
    logger = init_logger(log_root=run_dir, file_name='log.txt')
    output_folder_path = opjoin(_run.observers[0].dir,
                                config['path']['output_folder_name'])
    os.makedirs(output_folder_path, exist_ok=True)

    best_acc_list, last_acc_list = [], []
    train_best_list, train_last_list = [], []
    best_epoch = []

    data = load_data(config=config)
    if config['data']['random_split']['use']:
        split_iterator = range(config['data']['random_split']['num_splits'])
    else:
        split_iterator = range(1)
    config['adj'] = data[0]

    for i in split_iterator:
        output_folder = opjoin(output_folder_path, str(i))
        os.makedirs(output_folder, exist_ok=True)

        if config['data']['random_split']['use']:
            data = resplit(
                dataset=config['data']['dataset'],
                data=data,
                full_sup=config['data']['full_sup'],
                num_classes=torch.unique(data[2]).shape[0],
                num_nodes=data[1].shape[0],
                num_per_class=config['data']['label_per_class'],
            )
            # NOTE(review): the flattened source does not show whether this
            # print sits inside or after the `if`; confirm the nesting.
            print(torch.sum(data[3]))

        model = GNN(config=config)
        if i == 0:
            logger.info(model)
        if config['use_gpu']:
            model.cuda()
            data = [
                each.cuda() if hasattr(each, 'cuda') else each
                for each in data
            ]

        optim_cfg = config['optim']
        optimizer = init_optimizer(params=model.parameters(),
                                   optim_type=optim_cfg['type'],
                                   lr=optim_cfg['lr'],
                                   weight_decay=optim_cfg['weight_decay'],
                                   momentum=optim_cfg['momemtum'])
        criterion = nn.NLLLoss()

        # Output locations for this split.
        best_model_path = opjoin(output_folder, 'best_model.pth')
        last_model_path = opjoin(output_folder, 'last_model.pth')
        best_dict_path = opjoin(output_folder, 'best_pred_dict.pkl')
        last_dict_path = opjoin(output_folder, 'last_pred_dict.pkl')
        losses_curve_path = opjoin(output_folder, 'losses.pkl')
        accs_curve_path = opjoin(output_folder, 'accs.pkl')
        best_state_path = opjoin(output_folder, 'best_state.pkl')
        grads_path = opjoin(output_folder, 'grads.pkl')

        (best_pred_dict, last_pred_dict, train_losses, train_accs,
         val_losses, val_accs, best_state, grads,
         model_state) = train(best_model_path, last_model_path, config,
                              criterion, data, logger, model, optimizer)
        last_model_state, best_model_state = model_state
        losses_dict = {'train': train_losses, 'val': val_losses}
        accs_dict = {'train': train_accs, 'val': val_accs}

        logger.info(f'split_seed: {i: 04d}')
        logger.info(f'Test set results on the last model:')
        last_pred_dict = test(
            criterion,
            data,
            last_model_path,
            last_pred_dict,
            logger,
            model,
            last_model_state,
        )

        logger.info(f'Test set results on the best model:')
        if not config['fastmode']:
            best_pred_dict = test(
                criterion,
                data,
                best_model_path,
                best_pred_dict,
                logger,
                model,
                best_model_state,
            )
        else:
            # In fast mode the last model's results stand in for the best.
            best_pred_dict = last_pred_dict
        logger.info('\n')

        # Validate and pickle each artefact, in the original order.
        artefacts = (
            (best_pred_dict, best_dict_path),
            (last_pred_dict, last_dict_path),
            (losses_dict, losses_curve_path),
            (accs_dict, accs_curve_path),
            (best_state, best_state_path),
            (grads, grads_path),
        )
        for payload, target_path in artefacts:
            check_before_pkl(payload)
            with open(target_path, 'wb') as f:
                pkl.dump(payload, f)

        best_acc_list.append(best_pred_dict['test acc'].item())
        last_acc_list.append(last_pred_dict['test acc'].item())
        train_best_list.append(best_state['train acc'].item())
        train_last_list.append(train_accs[-1].item())
        best_epoch.append(best_state['epoch'])

    logger.info('********************* STATISTICS *********************')
    np.set_printoptions(precision=4, suppress=True)
    logger.info(f"\n"
                f"Best test acc: {best_acc_list}\n"
                f"Mean: {np.mean(best_acc_list)}\t"
                f"Std: {np.std(best_acc_list)}\n"
                f"Last test acc: {last_acc_list}\n"
                f"Mean: {np.mean(last_acc_list)}\t"
                f"Std: {np.std(last_acc_list)}\n")
    logger.info(f"best epoch: {best_epoch}")