def main():
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    model = VGAE(args.in_dim, args.hidden_dims, zdim=16, device=device)
    model.to(device)

    print('Loading data')
    with open(args.data_file, 'rb') as f:
        graphs = dill.load(f)
    print('Loaded {} molecules'.format(len(graphs)))

    train_graphs, val_graphs = train_test_split(graphs, test_size=10000)
    train_dataset = MolDataset(train_graphs)
    val_dataset = MolDataset(val_graphs)
    del train_graphs, val_graphs

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                              shuffle=True, collate_fn=collate)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size,
                            shuffle=False, collate_fn=collate)

    trainer = Trainer(model, args)
    train_losses, val_losses = [], []
    train_loss = 0

    print('Training Start')
    t = trange(args.n_epochs, desc="Loss: 0.0", leave=True)
    for epoch in t:
        t.set_description("Loss: {}".format(train_loss))
        t.refresh()

        train_loss = 0
        model.train()
        for bg in tqdm(train_loader):
            bg.set_e_initializer(dgl.init.zero_initializer)
            bg.set_n_initializer(dgl.init.zero_initializer)
            train_loss += trainer.iteration(bg)
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        trainer.save(epoch, args.save_dir)

        val_loss = 0
        model.eval()
        for bg in val_loader:
            bg.set_e_initializer(dgl.init.zero_initializer)
            bg.set_n_initializer(dgl.init.zero_initializer)
            val_loss += trainer.iteration(bg, train=False)
        val_loss /= len(val_loader)
        val_losses.append(val_loss)
        #print('Epoch: {:02d} | Train Loss: {:.4f} | Validation Loss: {:.4f}'.format(epoch, train_loss, val_loss))

    plot(train_losses, val_losses)
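# The plot() call above refers to a helper that is not shown in this snippet.
# A minimal sketch, assuming it draws the train/validation loss curves with
# matplotlib and writes them as 'zinc250k.png' (the save path is illustrative):
import os
import matplotlib
matplotlib.use('Agg')  # headless backend for training servers
import matplotlib.pyplot as plt

def plot(train_losses, val_losses, save_dir='.'):
    # one curve per loss list, indexed by epoch
    plt.figure()
    plt.plot(train_losses, label='train')
    plt.plot(val_losses, label='validation')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()
    plt.savefig(os.path.join(save_dir, 'zinc250k.png'))
    plt.close()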
def infer(args):
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    test_ds = MolDataset(args, raw_dataset, mode="test")

    fn = MgfCollateFn(args, mode="test")
    test_loader = Dataloader(test_ds,
                             batch_size=args.batch_size,
                             num_workers=1,
                             collate_fn=fn)
    test_loader = PDataset.from_generator_func(test_loader)

    est = propeller.Learner(MgfModel, args, args.model_config)

    mgf_list = []
    for soft_mgf in est.predict(test_loader,
                                ckpt_path=args.model_path_for_infer,
                                split_batch=True):
        mgf_list.append(soft_mgf)

    mgf = np.concatenate(mgf_list)
    log.info("saving features")
    np.save("dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_")),
            mgf)
def get_dataset_dataloader(train_keys, test_keys, data_dir, id_to_y,
                           batch_size, num_workers, pos_noise_std):
    from torch.utils.data import DataLoader
    from dataset import MolDataset, tensor_collate_fn

    train_dataset = MolDataset(train_keys, data_dir, id_to_y,
                               pos_noise_std=pos_noise_std)
    train_dataloader = DataLoader(train_dataset, batch_size,
                                  num_workers=num_workers,
                                  collate_fn=tensor_collate_fn,
                                  shuffle=True)

    test_dataset = MolDataset(test_keys, data_dir, id_to_y)
    test_dataloader = DataLoader(test_dataset, batch_size,
                                 num_workers=num_workers,
                                 collate_fn=tensor_collate_fn,
                                 shuffle=False)

    return train_dataset, train_dataloader, test_dataset, test_dataloader
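# Hypothetical usage of get_dataset_dataloader() above; the keys, directory and
# label mapping are illustrative placeholders, not values from this project.
train_ds, train_dl, test_ds, test_dl = get_dataset_dataloader(
    train_keys=['complex_0001', 'complex_0002'],
    test_keys=['complex_0003'],
    data_dir='data/',
    id_to_y={'complex_0001': 1.0, 'complex_0002': 0.0, 'complex_0003': 1.0},
    batch_size=32,
    num_workers=4,
    pos_noise_std=0.1,
)
for batch in train_dl:
    # each batch is whatever tensor_collate_fn produces for a list of samples
    break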
print(f"No {args.potential} potential") exit(-1) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model = utils.initialize_model(model, device, args.restart_file) print(f"vina_hbond_coeff: {model.vina_hbond_coeff.data.cpu().numpy()[0]:.3f}") print(f"vina_hydrophobic_coeff: \ {model.vina_hydrophobic_coeff.data.cpu().numpy()[0]:.3f}") print(f"rotor_coeff: {model.rotor_coeff.data.cpu().numpy()[0]:.3f}") print(f"vdw_coeff: {model.vdw_coeff.data.cpu().numpy()[0]:.3f}") # exit(-1) print("number of parameters : ", sum(p.numel() for p in model.parameters() if p.requires_grad)) # Dataloader test_dataset = MolDataset(test_keys, args.data_dir, id_to_y) test_data_loader = DataLoader(test_dataset, args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=tensor_collate_fn) # test st = time.time() test_losses1 = [] test_losses2 = [] test_pred = dict() test_true = dict()
import torch
from rdkit import Chem
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from widis_lstm_tools.preprocessing import random_dataset_split, inds_to_one_hot

from dataset import MolDataset
from model import MoLSTM
from train import train

# device
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# load dataset
filepath = r'results\fifth_submission.txt'
data = MolDataset(filepath)

# data splitting
train_data, test_data = random_dataset_split(data, split_sizes=(90 / 100., 10 / 100.))

# data loader
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=True)

# model
model = MoLSTM(n_inputs=len(data.id2char), hidden_size=128)
model = model.to(device)
criterion = nn.CrossEntropyLoss()

# Optimizer, scheduler
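# The '# Optimizer, scheduler' comment above is where the original script sets up
# its optimizer; the concrete choice is not shown in this excerpt, so the lines
# below are only a sketch with illustrative hyperparameters (Adam + step decay).
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
# The imported train() routine would then be called with these objects; its exact
# signature is not visible here, so no call is shown.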
# print simple statistics about dude data and pdbbind data
print(f'Number of train data: {len(train_keys)}')
print(f'Number of test data: {len(test_keys)}')

# initialize model
if args.ngpu > 0:
    cmd = utils.set_cuda_visible_device(args.ngpu)
    os.environ['CUDA_VISIBLE_DEVICES'] = cmd[:-1]
model = gnn(args)
print('number of parameters : ',
      sum(p.numel() for p in model.parameters() if p.requires_grad))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = utils.initialize_model(model, device)

# train and test dataset
train_dataset = MolDataset(train_keys, args.dude_data_fpath)
test_dataset = MolDataset(test_keys, args.dude_data_fpath)

num_train_chembl = len([0 for k in train_keys if 'CHEMBL' in k])
num_train_decoy = len([0 for k in train_keys if 'CHEMBL' not in k])
train_weights = [
    1 / num_train_chembl if 'CHEMBL' in k else 1 / num_train_decoy
    for k in train_keys
]
train_sampler = DTISampler(train_weights, len(train_weights), replacement=True)

train_dataloader = DataLoader(train_dataset, args.batch_size,
                              shuffle=False, num_workers=args.num_workers,
                              collate_fn=collate_fn, sampler=train_sampler)
test_dataloader = DataLoader(test_dataset, args.batch_size,
                             shuffle=False, num_workers=args.num_workers,
                             collate_fn=collate_fn)

# optimizer
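# DTISampler is imported from this repository and not shown above. A plausible
# stand-in that behaves like torch.utils.data.WeightedRandomSampler, i.e. it
# resamples actives and decoys according to the class-balancing weights
# (the class name below is hypothetical, for illustration only):
import numpy as np
from torch.utils.data import Sampler

class WeightedDTISampler(Sampler):
    def __init__(self, weights, num_samples, replacement=True):
        self.weights = np.asarray(weights, dtype=np.float64)
        self.weights = self.weights / self.weights.sum()  # normalize to a distribution
        self.num_samples = num_samples
        self.replacement = replacement

    def __iter__(self):
        # draw dataset indices according to the per-sample weights
        idx = np.random.choice(len(self.weights), self.num_samples,
                               replace=self.replacement, p=self.weights)
        return iter(idx.tolist())

    def __len__(self):
        return self.num_samples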
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on pcqm4m with PGL')
    parser.add_argument('--use_cuda', action='store_true')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--gnn', type=str, default='gin-virtual',
                        help='GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--graph_pooling', type=str, default='sum',
                        help='graph pooling strategy mean or sum (default: sum)')
    parser.add_argument('--drop_ratio', type=float, default=0,
                        help='dropout ratio (default: 0)')
    parser.add_argument('--num_layers', type=int, default=5,
                        help='number of GNN message passing layers (default: 5)')
    parser.add_argument('--emb_dim', type=int, default=600,
                        help='dimensionality of hidden units in GNNs (default: 600)')
    parser.add_argument('--train_subset', action='store_true')
    parser.add_argument('--batch_size', type=int, default=256,
                        help='input batch size for training (default: 256)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=1,
                        help='number of workers (default: 1)')
    parser.add_argument('--log_dir', type=str, default="",
                        help='tensorboard log directory')
    parser.add_argument('--checkpoint_dir', type=str, default='',
                        help='directory to save checkpoint')
    parser.add_argument('--save_test_dir', type=str, default='',
                        help='directory to save test submission file')
    args = parser.parse_args()

    print(args)

    random.seed(42)
    np.random.seed(42)
    paddle.seed(42)

    if not args.use_cuda:
        paddle.set_device("cpu")

    ### automatic dataloading and splitting
    class Config():
        def __init__(self):
            self.base_data_path = "./dataset"

    config = Config()
    ds = MolDataset(config)

    split_idx = ds.get_idx_split()
    test_ds = Subset(ds, split_idx['test'])
    print("Test examples: ", len(test_ds))

    ### automatic evaluator. takes dataset name as input
    evaluator = PCQM4MEvaluator()

    test_loader = Dataloader(test_ds,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=CollateFn())

    shared_params = {
        'num_layers': args.num_layers,
        'emb_dim': args.emb_dim,
        'drop_ratio': args.drop_ratio,
        'graph_pooling': args.graph_pooling
    }

    if args.gnn == 'gin':
        model = GNN(gnn_type='gin', virtual_node=False, **shared_params)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin', virtual_node=True, **shared_params)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn', virtual_node=False, **shared_params)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn', virtual_node=True, **shared_params)
    else:
        raise ValueError('Invalid GNN type')

    num_params = sum(p.numel() for p in model.parameters())
    print(f'#Params: {num_params}')

    checkpoint_path = os.path.join(args.checkpoint_dir, 'checkpoint.pdparams')
    if not os.path.exists(checkpoint_path):
        raise RuntimeError(f'Checkpoint file not found at {checkpoint_path}')

    model.set_state_dict(paddle.load(checkpoint_path))

    print('Predicting on test data...')
    y_pred = test(model, test_loader)
    print('Saving test submission file...')
    evaluator.save_test_submission({'y_pred': y_pred}, args.save_test_dir)
if args.ngpu > 0:
    cmd = utils.set_cuda_visible_device(args.ngpu)
    os.environ['CUDA_VISIBLE_DEVICES'] = cmd[:-1]

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

c_to_i = pickle.load(open(args.c_to_i, 'rb'))
i_to_c = pickle.load(open(args.i_to_c, 'rb'))
n_char = len(c_to_i)

dataloaders = []
for fn in args.filenames:
    with open(fn) as f:
        lines = f.readlines()
    lines = [s.strip().split()[1] for s in lines]
    test_dataset = MolDataset(lines, c_to_i)
    test_dataloader = DataLoader(test_dataset, args.batch_size, shuffle=True,
                                 num_workers=args.num_workers,
                                 collate_fn=my_collate)
    dataloaders.append(test_dataloader)

model = model.RNN(args.n_feature, args.n_feature, n_char, args.n_layer, i_to_c)
model = utils.initialize_model(model, device, args.save_files)
print("number of parameters :",
      sum(p.numel() for p in model.parameters() if p.requires_grad))

model.eval()
for fn, dataloader in zip(args.filenames, dataloaders):
    log_likelihoods = []
    for i_batch, sample in enumerate(dataloader):
        x, l = sample['X'].to(device).long(), sample['L'].long().data.cpu().numpy()
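# my_collate is imported from this repository and not shown. A minimal sketch,
# assuming each dataset item is a dict with an integer-encoded SMILES sequence 'X'
# and its length 'L', padded here to the longest sequence in the batch:
import torch

def my_collate(batch):
    lengths = torch.tensor([int(item['L']) for item in batch])
    max_len = int(lengths.max())
    padded = torch.zeros(len(batch), max_len, dtype=torch.long)
    for i, item in enumerate(batch):
        seq = torch.as_tensor(item['X'], dtype=torch.long)
        padded[i, :len(seq)] = seq
    return {'X': padded, 'L': lengths}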
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on pcqm4m with PGL')
    parser.add_argument('--use_cuda', action='store_true')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--gnn', type=str, default='gin-virtual',
                        help='GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--graph_pooling', type=str, default='sum',
                        help='graph pooling strategy mean or sum (default: sum)')
    parser.add_argument('--drop_ratio', type=float, default=0,
                        help='dropout ratio (default: 0)')
    parser.add_argument('--num_layers', type=int, default=5,
                        help='number of GNN message passing layers (default: 5)')
    parser.add_argument('--emb_dim', type=int, default=600,
                        help='dimensionality of hidden units in GNNs (default: 600)')
    parser.add_argument('--train_subset', action='store_true')
    parser.add_argument('--batch_size', type=int, default=256,
                        help='input batch size for training (default: 256)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=1,
                        help='number of workers (default: 1)')
    parser.add_argument('--log_dir', type=str, default="",
                        help='tensorboard log directory')
    parser.add_argument('--checkpoint_dir', type=str, default='',
                        help='directory to save checkpoint')
    parser.add_argument('--save_test_dir', type=str, default='',
                        help='directory to save test submission file')
    args = parser.parse_args()

    print(args)

    random.seed(42)
    np.random.seed(42)
    paddle.seed(42)

    if not args.use_cuda:
        paddle.set_device("cpu")

    ### automatic dataloading and splitting
    class Config():
        def __init__(self):
            self.base_data_path = "./dataset"

    config = Config()
    ds = MolDataset(config)

    split_idx = ds.get_idx_split()
    train_ds = Subset(ds, split_idx['train'])
    valid_ds = Subset(ds, split_idx['valid'])
    test_ds = Subset(ds, split_idx['test'])

    print("Train examples: ", len(train_ds))
    print("Valid examples: ", len(valid_ds))
    print("Test examples: ", len(test_ds))

    ### automatic evaluator. takes dataset name as input
    evaluator = PCQM4MEvaluator()

    train_loader = Dataloader(train_ds,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              collate_fn=CollateFn())
    valid_loader = Dataloader(valid_ds,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=CollateFn())

    if args.save_test_dir != '':
        test_loader = Dataloader(test_ds,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=CollateFn())

    if args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    shared_params = {
        'num_layers': args.num_layers,
        'emb_dim': args.emb_dim,
        'drop_ratio': args.drop_ratio,
        'graph_pooling': args.graph_pooling
    }

    if args.gnn == 'gin':
        model = GNN(gnn_type='gin', virtual_node=False, **shared_params)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin', virtual_node=True, **shared_params)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn', virtual_node=False, **shared_params)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn', virtual_node=True, **shared_params)
    else:
        raise ValueError('Invalid GNN type')

    num_params = sum(p.numel() for p in model.parameters())
    print(f'#Params: {num_params}')

    if args.log_dir != '':
        writer = SummaryWriter(log_dir=args.log_dir)

    best_valid_mae = 1000

    scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.001,
                                              step_size=300, gamma=0.25)
    optimizer = paddle.optimizer.Adam(learning_rate=scheduler,
                                      parameters=model.parameters())

    msg = "ogbg_lsc_paddle_baseline\n"
    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train_mae = train(model, train_loader, optimizer)

        print('Evaluating...')
        valid_mae = eval(model, valid_loader, evaluator)

        print({'Train': train_mae, 'Validation': valid_mae})

        if args.log_dir != '':
            writer.add_scalar('valid/mae', valid_mae, epoch)
            writer.add_scalar('train/mae', train_mae, epoch)

        if valid_mae < best_valid_mae:
            best_valid_mae = valid_mae
            if args.checkpoint_dir != '':
                print('Saving checkpoint...')
                paddle.save(model.state_dict(),
                            os.path.join(args.checkpoint_dir, 'checkpoint.pdparams'))

            if args.save_test_dir != '':
                print('Predicting on test data...')
                y_pred = test(model, test_loader)
                print('Saving test submission file...')
                evaluator.save_test_submission({'y_pred': y_pred},
                                               args.save_test_dir)

        scheduler.step()

        print(f'Best validation MAE so far: {best_valid_mae}')

        try:
            msg += "Epoch: %d | Train: %.6f | Valid: %.6f | Best Valid: %.6f\n" \
                % (epoch, train_mae, valid_mae, best_valid_mae)
            print(msg)
        except:
            continue

    if args.log_dir != '':
        writer.close()
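# The train()/eval() helpers called in the loop above are defined elsewhere in the
# baseline. A hedged sketch, assuming the PGL Dataloader yields (graph, labels)
# pairs and the model regresses one HOMO-LUMO gap value per graph:
import numpy as np
import paddle

def train(model, loader, optimizer):
    model.train()
    loss_fn = paddle.nn.L1Loss()
    losses = []
    for g, labels in loader:
        g = g.tensor()  # convert the pgl.Graph to paddle tensors
        labels = paddle.to_tensor(labels, dtype='float32')
        pred = paddle.reshape(model(g), [-1])
        loss = loss_fn(pred, labels)
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        losses.append(float(loss))
    return float(np.mean(losses))

def eval(model, loader, evaluator):
    model.eval()
    y_true, y_pred = [], []
    with paddle.no_grad():
        for g, labels in loader:
            g = g.tensor()
            pred = paddle.reshape(model(g), [-1])
            y_true.append(np.asarray(labels).reshape(-1))
            y_pred.append(pred.numpy().reshape(-1))
    # PCQM4MEvaluator returns a dict with the mean absolute error under 'mae'
    result = evaluator.eval({'y_true': np.concatenate(y_true),
                             'y_pred': np.concatenate(y_pred)})
    return result['mae']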
def main():
    now = time.localtime()
    s = "%04d-%02d-%02d %02d:%02d:%02d" % (now.tm_year, now.tm_mon, now.tm_mday,
                                           now.tm_hour, now.tm_min, now.tm_sec)
    print(s)

    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", help="learning rate", type=float, default=0.0001)
    parser.add_argument("--epoch", help="epoch", type=int, default=10000)
    parser.add_argument("--ngpu", help="number of gpu", type=int, default=1)
    parser.add_argument("--batch_size", help="batch_size", type=int, default=32)
    parser.add_argument("--num_workers", help="number of workers", type=int, default=7)
    parser.add_argument("--n_graph_layer", help="number of GNN layer", type=int, default=4)
    parser.add_argument("--d_graph_layer", help="dimension of GNN layer", type=int, default=140)
    parser.add_argument("--n_FC_layer", help="number of FC layer", type=int, default=4)
    parser.add_argument("--d_FC_layer", help="dimension of FC layer", type=int, default=128)
    parser.add_argument("--dude_data_fpath", help="file path of dude data", type=str, default='data/')
    parser.add_argument("--save_dir", help="save directory of model parameter", type=str, default='./save/')
    parser.add_argument("--initial_mu", help="initial value of mu", type=float, default=4.0)
    parser.add_argument("--initial_dev", help="initial value of dev", type=float, default=1.0)
    parser.add_argument("--dropout_rate", help="dropout_rate", type=float, default=0.0)
    parser.add_argument("--train_keys", help="train keys", type=str, default='keys/train_keys.pkl')
    parser.add_argument("--test_keys", help="test keys", type=str, default='keys/test_keys.pkl')
    args = parser.parse_args()
    print(args)

    # hyper parameters
    num_epochs = args.epoch
    lr = args.lr
    ngpu = args.ngpu
    batch_size = args.batch_size
    dude_data_fpath = args.dude_data_fpath
    save_dir = args.save_dir

    # make save dir if it doesn't exist
    if not os.path.isdir(save_dir):
        os.system('mkdir ' + save_dir)
        print('save_dir({}) created'.format(save_dir))
    print('save_dir:{}'.format(save_dir))
    print('+' * 10)

    # read data. data is stored as a dictionary; each key holds information
    # about one protein-ligand complex.
    with open(args.train_keys, 'rb') as fp:
        train_keys = pickle.load(fp)
    #
    # train_keys: type=list, len=730,
    # ['andr_C36276925', 'dhi1_C08592133', 'hivpr_C59233791', 'hivrt_C66397637', 'cah2_C62892628', ... ]
    #
    print('train_keys({}) loaded from pickle --> type:{}, len:{}, ex:\n{}'.
          format(args.train_keys, type(train_keys), len(train_keys), train_keys[:5]))
    print('+' * 3)

    with open(args.test_keys, 'rb') as fp:
        test_keys = pickle.load(fp)
    #
    # test_keys: type=list, len=255,
    # ['fnta_C59365794', 'ace_C22923016', 'aces_C21842010', 'kith_C11223989', 'kpcb_C37928874', ... ]
    #
    print('test_keys({}) loaded from pickle --> type:{}, len:{}, ex:\n{}'.
          format(args.test_keys, type(test_keys), len(test_keys), test_keys[:5]))
    print('+' * 10)

    # print simple statistics about dude data and pdbbind data
    print(f'Number of train data: {len(train_keys)}')
    print(f'Number of test data: {len(test_keys)}')

    if 0 < args.ngpu:
        cmd = utils.set_cuda_visible_device(args.ngpu)
        print('utils.set_cuda_visible_device({}) --> cmd:{}'.format(args.ngpu, cmd))
        os.environ['CUDA_VISIBLE_DEVICES'] = cmd[:-1]

    model = gnn(args)
    print('+' * 10)
    print('number of parameters : ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    device = torch.device(
        "cuda:0" if torch.cuda.is_available() and 0 < args.ngpu else "cpu")
    print('device: {}'.format(device))

    # initialize model
    model = utils.initialize_model(model, device)

    print('#' * 80)
    print('dude_data_fpath:{}'.format(args.dude_data_fpath))

    # train and test dataset
    train_dataset = MolDataset(train_keys, args.dude_data_fpath)
    test_dataset = MolDataset(test_keys, args.dude_data_fpath)
    print('#' * 80)

    num_train_chembl = len([0 for k in train_keys if 'CHEMBL' in k])
    num_train_decoy = len([0 for k in train_keys if 'CHEMBL' not in k])
    print('#1:num_train_chembl:{}, num_train_decoy:{}'.format(
        num_train_chembl, num_train_decoy))

    num_train_chembl = len([0 for k in train_keys if 'CHEMBL' in k])
    num_train_decoy = len(train_keys) - num_train_chembl
    print('#2:num_train_chembl:{}, num_train_decoy:{}'.format(
        num_train_chembl, num_train_decoy))

    #train_weights = [1/num_train_chembl if 'CHEMBL' in k else 1/num_train_decoy for k in train_keys]
    train_weight_chembl = 1.0 / num_train_chembl
    train_weight_decoy = 1.0 / num_train_decoy
    train_weights = [
        train_weight_chembl if 'CHEMBL' in k else train_weight_decoy
        for k in train_keys
    ]
    print('main: sum(train_weights):{}'.format(sum(train_weights)))
    print('train_weight_chembl:{} / train_weight_decoy:{}, len(train_weights):{}'
          .format(train_weight_chembl, train_weight_decoy, len(train_weights)))

    train_sampler = DTISampler(train_weights, len(train_weights), replacement=True)

    print('main: args.batch_size:{}, args.num_workers:{}'.format(
        args.batch_size, args.num_workers))

    #
    # train_dataset: object of MolDataset(torch.utils.data.Dataset)
    #
    train_dataloader = DataLoader(train_dataset, args.batch_size,
                                  shuffle=False, num_workers=args.num_workers,
                                  collate_fn=collate_fn, sampler=train_sampler)
    #
    # test_dataset: object of MolDataset(torch.utils.data.Dataset)
    #
    test_dataloader = DataLoader(test_dataset, args.batch_size,
                                 shuffle=False, num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # optimizer
    #optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

    # loss function --> cross-entropy over the two classes
    #loss_fn = nn.BCELoss()
    loss_fn = nn.CrossEntropyLoss()

    test_roc_list = list()
    best_test_roc = 0.0

    for epoch in range(num_epochs):
        st = time.time()

        # collect losses of each iteration
        train_losses = []
        test_losses = []

        # collect true label of each iteration
        train_true = []
        test_true = []

        # collect predicted label of each iteration
        train_pred = []
        test_pred = []

        model.train()  # sets the model in training mode.
        #print('model.training:{}'.format(model.training))
        for i_batch, sample in enumerate(train_dataloader):
            model.zero_grad()
            H, A1, A2, Y, V, keys = sample
            n_queried, n_max_n1, n_max_n2, n_max_adj, n_file_opened = \
                train_dataset.get_n_queried()

            if epoch == 0 and i_batch == 0:
                print('#1:{}/{} H:type:{}, shape:{}\n{}'.format(
                    i_batch, epoch, type(H), H.shape, H))
                print(' A1:type:{}, shape:{}\n{}'.format(type(A1), A1.shape, A1))
                print(' A2:type:{}, shape:{}\n{}'.format(type(A2), A2.shape, A2))
                print(' Y:type:{}, shape:{}\n{}'.format(type(Y), Y.shape, Y))
                print(' V:type:{}, shape:{}\n{}'.format(type(V), V.shape, V))
                print(' keys:type:{}\n{}'.format(type(keys), keys))
                print(' train_dataset: n_queried:{}, n_max_n1:{}, n_max_n2:{}, n_max_adj:{}, n_file_opened:{}'
                      .format(n_queried, n_max_n1, n_max_n2, n_max_adj, n_file_opened))
                print('+' * 10)

            H, A1, A2, Y, V = H.to(device), A1.to(device), A2.to(device), \
                Y.to(device), V.to(device)

            if epoch == 0 and i_batch == 0:
                print('#2:{}/{} H:type:{}, shape:{}\n{}'.format(
                    i_batch, epoch, type(H), H.shape, H))
                print(' A1:type:{}, shape:{}\n{}'.format(type(A1), A1.shape, A1))
                print(' A2:type:{}, shape:{}\n{}'.format(type(A2), A2.shape, A2))
                print(' Y:type:{}, shape:{}\n{}'.format(type(Y), Y.shape, Y))
                print(' V:type:{}, shape:{}\n{}'.format(type(V), V.shape, V))
                print(' keys:type:{}\n{}'.format(type(keys), keys))
                print(' train_dataset: n_queried:{}, n_max_n1:{}, n_max_n2:{}, n_max_adj:{}, n_file_opened:{}'
                      .format(n_queried, n_max_n1, n_max_n2, n_max_adj, n_file_opened))
                print('+' * 10)

            # train neural network
            pred = model.train_model((H, A1, A2, V))
            #pred = model.module.train_model((H, A1, A2, V))

            pred = pred.cpu()
            pred_softmax = pred.detach().numpy()
            pred_softmax = softmax(pred_softmax, axis=1)[:, 1]

            if epoch == 0 and i_batch == 0:
                print('{}/{} pred:shape:{}\n{}\nY.shape:{}'.format(
                    i_batch, epoch, pred.shape, pred, Y.shape))
                print('+' * 10)
                print('{}/{} pred_softmax:shape:{}\n{}'.format(
                    i_batch, epoch, pred_softmax.shape, pred_softmax))
                print('+' * 10)

            loss = loss_fn(pred, Y)
            if epoch == 0 and i_batch == 0:
                print('{}/{} loss:shape:{}\n{}'.format(i_batch, epoch, loss.shape, loss))
                print('+' * 10)

            loss.backward()
            optimizer.step()

            # collect loss, true label and predicted label
            train_losses.append(loss.data.cpu().numpy())
            train_true.append(Y.data.cpu().numpy())
            #train_pred.append(pred.data.cpu().numpy())
            train_pred.append(pred_softmax)
            #if i_batch > 10: break
        # end of for i_batch, sample

        model.eval()  # equivalent with model.train(mode=False)
        for i_batch, sample in enumerate(test_dataloader):
            model.zero_grad()
            H, A1, A2, Y, V, keys = sample
            H, A1, A2, Y, V = H.to(device), A1.to(device), A2.to(device), \
                Y.to(device), V.to(device)

            # evaluate neural network
            pred = model.train_model((H, A1, A2, V))
            pred = pred.cpu()
            pred_softmax = pred.detach().numpy()
            pred_softmax = softmax(pred_softmax, axis=1)[:, 1]
            loss = loss_fn(pred, Y)

            # collect loss, true label and predicted label
            test_losses.append(loss.data.cpu().numpy())
            test_true.append(Y.data.cpu().numpy())
            #test_pred.append(pred.data.cpu().numpy())
            test_pred.append(pred_softmax)
            #if i_batch > 10: break

            if epoch == 0 and i_batch == 0:
                print('eval: Y.shape:{}, pred.shape:{}, pred_softmax.shape:{}'
                      .format(Y.shape, pred.shape, pred_softmax.shape))
        train_losses = np.mean(np.array(train_losses))
        test_losses = np.mean(np.array(test_losses))

        train_pred = np.concatenate(np.array(train_pred), 0)
        test_pred = np.concatenate(np.array(test_pred), 0)
        train_true = np.concatenate(np.array(train_true), 0)
        test_true = np.concatenate(np.array(test_true), 0)

        #print('#' * 80)
        #print('train_pred:\n{}'.format(train_pred))
        #print('+' * 7)
        ##print(softmax(train_pred, axis=1))
        #print('+' * 10)
        #print('train_true:\n{}'.format(train_true))
        #print('#' * 80, flush=True)

        train_roc = roc_auc_score(train_true, train_pred)
        test_roc = roc_auc_score(test_true, test_pred)

        end = time.time()

        if epoch == 0:
            print('epoch\ttrain_losses\ttest_losses\ttrain_roc\ttest_roc\telapsed_time')

        #print('#' * 80)
        #print('epoch\ttrain_losses\ttest_losses\ttrain_roc\ttest_roc\telapsed_time')
        #print("%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f" % ...
        print('%s\t%.6f\t%.6f\t%.6f\t%.6f\t%.6f\t%s'
              % (epoch, train_losses, test_losses, train_roc, test_roc, end - st,
                 datetime.datetime.fromtimestamp(end).strftime('%Y-%m-%d %H:%M:%S.%f')),
              end='')

        #name = save_dir + '/save_' + str(epoch) + '.pt'
        #torch.save(model.state_dict(), name)
        if best_test_roc < test_roc:
            name = save_dir + '/save_' + str(epoch) + '.pt'
            torch.save(model.state_dict(), name)
            print(' updated')
            best_test_roc = test_roc
        else:
            print('')

        test_roc_list.append(test_roc)
def main(config):
    if dist.get_world_size() > 1:
        dist.init_parallel_env()

    if dist.get_rank() == 0:
        timestamp = datetime.now().strftime("%Hh%Mm%Ss")
        log_path = os.path.join(config.log_dir, "tensorboard_log_%s" % timestamp)
        writer = SummaryWriter(log_path)

    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=config.dataset_name)
    config.num_class = raw_dataset.num_tasks
    config.eval_metric = raw_dataset.eval_metric
    config.task_type = raw_dataset.task_type

    mol_dataset = MolDataset(config, raw_dataset, transform=make_multihop_edges)
    splitted_index = raw_dataset.get_idx_split()
    train_ds = Subset(mol_dataset, splitted_index['train'], mode='train')
    valid_ds = Subset(mol_dataset, splitted_index['valid'], mode="valid")
    test_ds = Subset(mol_dataset, splitted_index['test'], mode="test")

    log.info("Train Examples: %s" % len(train_ds))
    log.info("Val Examples: %s" % len(valid_ds))
    log.info("Test Examples: %s" % len(test_ds))

    fn = CollateFn(config)

    train_loader = Dataloader(train_ds,
                              batch_size=config.batch_size,
                              shuffle=True,
                              num_workers=config.num_workers,
                              collate_fn=fn)
    valid_loader = Dataloader(valid_ds,
                              batch_size=config.batch_size,
                              num_workers=config.num_workers,
                              collate_fn=fn)
    test_loader = Dataloader(test_ds,
                             batch_size=config.batch_size,
                             num_workers=config.num_workers,
                             collate_fn=fn)

    model = ClassifierNetwork(config.hidden_size, config.out_dim,
                              config.num_layers, config.dropout_prob,
                              config.virt_node, config.K, config.conv_type,
                              config.appnp_hop, config.alpha)
    model = paddle.DataParallel(model)

    optim = Adam(learning_rate=config.lr, parameters=model.parameters())
    criterion = nn.loss.BCEWithLogitsLoss()
    evaluator = Evaluator(config.dataset_name)

    best_valid = 0

    global_step = 0
    for epoch in range(1, config.epochs + 1):
        model.train()
        for idx, batch_data in enumerate(train_loader):
            g, mh_graphs, labels, unmask = batch_data
            g = g.tensor()
            multihop_graphs = []
            for item in mh_graphs:
                multihop_graphs.append(item.tensor())
            g.multi_hop_graphs = multihop_graphs
            labels = paddle.to_tensor(labels)
            unmask = paddle.to_tensor(unmask)

            pred = model(g)
            pred = paddle.masked_select(pred, unmask)
            labels = paddle.masked_select(labels, unmask)
            train_loss = criterion(pred, labels)
            train_loss.backward()
            optim.step()
            optim.clear_grad()

            if global_step % 80 == 0:
                message = "train: epoch %d | step %d | " % (epoch, global_step)
                message += "loss %.6f" % (train_loss.numpy())
                log.info(message)
                if dist.get_rank() == 0:
                    writer.add_scalar("loss", train_loss.numpy(), global_step)
            global_step += 1

        valid_result = evaluate(model, valid_loader, criterion, evaluator)
        message = "valid: epoch %d | step %d | " % (epoch, global_step)
        for key, value in valid_result.items():
            message += " | %s %.6f" % (key, value)
            if dist.get_rank() == 0:
                writer.add_scalar("valid_%s" % key, value, global_step)
        log.info(message)

        test_result = evaluate(model, test_loader, criterion, evaluator)
        message = "test: epoch %d | step %d | " % (epoch, global_step)
        for key, value in test_result.items():
            message += " | %s %.6f" % (key, value)
            if dist.get_rank() == 0:
                writer.add_scalar("test_%s" % key, value, global_step)
        log.info(message)

        if best_valid < valid_result[config.metrics]:
            best_valid = valid_result[config.metrics]
            best_valid_result = valid_result
            best_test_result = test_result

        message = "best result: epoch %d | " % (epoch)
        message += "valid %s: %.6f | " % (config.metrics,
                                          best_valid_result[config.metrics])
        message += "test %s: %.6f | " % (config.metrics,
                                         best_test_result[config.metrics])
        log.info(message)

    message = "final eval best result:%.6f" % best_valid_result[config.metrics]
    log.info(message)
    message = "final test best result:%.6f" % best_test_result[config.metrics]
    log.info(message)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

save_dir = args.save_dir
if not os.path.isdir(save_dir):
    os.system('mkdir ' + save_dir)

c_to_i = pickle.load(open(args.c_to_i, 'rb'))
i_to_c = pickle.load(open(args.i_to_c, 'rb'))
n_char = len(c_to_i)
print('c_to_i:', c_to_i)

with open(args.train_filenames) as f:
    lines = f.readlines()
train_lines = [s.strip().split()[1] for s in lines]
train_dataset = MolDataset(train_lines, dict(c_to_i),
                           args.enumerate_smiles, args.stereo)

with open(args.test_filenames) as f:
    lines = f.readlines()
test_lines = [s.strip().split()[1] for s in lines]
test_dataset = MolDataset(test_lines, dict(c_to_i),
                          args.enumerate_smiles, args.stereo)

train_dataloader = DataLoader(train_dataset, args.batch_size, shuffle=True,
                              num_workers=args.num_workers,
                              collate_fn=my_collate)
test_dataloader = DataLoader(test_dataset, args.batch_size, shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=my_collate)
def train(args, pretrained_model_config=None):
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    train_ds = MolDataset(args, raw_dataset)

    args.eval_steps = math.ceil(len(train_ds) / args.batch_size)
    log.info("Total %s steps (eval_steps) every epoch." % (args.eval_steps))

    fn = MgfCollateFn(args)

    train_loader = Dataloader(train_ds,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers,
                              shuffle=args.shuffle,
                              stream_shuffle_size=args.shuffle_size,
                              collate_fn=fn)

    # for evaluating
    eval_train_loader = train_loader
    eval_train_loader = PDataset.from_generator_func(eval_train_loader)

    train_loader = multi_epoch_dataloader(train_loader, args.epochs)
    train_loader = PDataset.from_generator_func(train_loader)

    if args.warm_start_from is not None:
        # warm start setting
        def _fn(v):
            if not isinstance(v, F.framework.Parameter):
                return False
            if os.path.exists(os.path.join(args.warm_start_from, v.name)):
                return True
            else:
                return False

        ws = propeller.WarmStartSetting(predicate_fn=_fn,
                                        from_dir=args.warm_start_from)
    else:
        ws = None

    def cmp_fn(old, new):
        if old['eval'][args.metrics] - new['eval'][args.metrics] > 0:
            log.info("best %s eval result: %s" % (args.metrics, new['eval']))
            return True
        else:
            return False

    if args.log_id is not None:
        save_best_model = int(args.log_id) == 5
    else:
        save_best_model = True
    best_exporter = propeller.exporter.BestResultExporter(
        args.output_dir, (cmp_fn, save_best_model))

    eval_datasets = {"eval": eval_train_loader}

    propeller.train.train_and_eval(
        model_class_or_model_fn=MgfModel,
        params=pretrained_model_config,
        run_config=args,
        train_dataset=train_loader,
        eval_dataset=eval_datasets,
        warm_start_setting=ws,
        exporters=[best_exporter],
    )
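# multi_epoch_dataloader is used above to feed propeller a single stream that
# replays the dataloader for args.epochs epochs. A plausible sketch of such a
# wrapper, assuming PDataset.from_generator_func accepts a generator function:
def multi_epoch_dataloader(loader, epochs):
    def _gen():
        for _ in range(epochs):
            for batch in loader:
                yield batch
    return _gen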