Example #1
def main():
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    model = VGAE(args.in_dim, args.hidden_dims, zdim=16, device=device)
    model.to(device)

    print('Loading data')
    with open(args.data_file, 'rb') as f:
        graphs = dill.load(f)
    print('Loaded {} molecules'.format(len(graphs)))
    train_graphs, val_graphs = train_test_split(graphs, test_size=10000)
    train_dataset = MolDataset(train_graphs)
    val_dataset = MolDataset(val_graphs)
    del train_graphs, val_graphs

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            collate_fn=collate)
    trainer = Trainer(model, args)
    train_losses, val_losses = [], []
    train_loss = 0
    print('Training Start')
    t = trange(args.n_epochs, desc="Loss: 0.0", leave=True)
    for epoch in t:
        t.set_description("Loss: {}".format(train_loss))
        t.refresh()
        train_loss = 0
        model.train()
        for bg in tqdm(train_loader):
            bg.set_e_initializer(dgl.init.zero_initializer)
            bg.set_n_initializer(dgl.init.zero_initializer)
            train_loss += trainer.iteration(bg)
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        trainer.save(epoch, args.save_dir)

        val_loss = 0
        model.eval()
        for bg in val_loader:
            bg.set_e_initializer(dgl.init.zero_initializer)
            bg.set_n_initializer(dgl.init.zero_initializer)
            val_loss += trainer.iteration(bg, train=False)
        val_loss /= len(val_loader)
        val_losses.append(val_loss)
        #print('Epoch: {:02d} | Train Loss: {:.4f} | Validation Loss: {:.4f}'.format(epoch, train_loss, val_loss))
    plot(train_losses, val_losses)
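The `collate` helper passed to both loaders above is not shown in this example; a minimal sketch, assuming each MolDataset item is a single DGLGraph, would batch the graphs with dgl.batch:

import dgl

def collate(samples):
    # samples: list of DGLGraph objects from MolDataset.__getitem__
    # (signature is an assumption; the original helper may also carry labels)
    return dgl.batch(samples)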
Example #2
def infer(args):
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    test_ds = MolDataset(args, raw_dataset, mode="test")

    fn = MgfCollateFn(args, mode="test")

    test_loader = Dataloader(test_ds,
                             batch_size=args.batch_size,
                             num_workers=1,
                             collate_fn=fn)
    test_loader = PDataset.from_generator_func(test_loader)

    est = propeller.Learner(MgfModel, args, args.model_config)

    mgf_list = []
    for soft_mgf in est.predict(test_loader,
                                ckpt_path=args.model_path_for_infer,
                                split_batch=True):
        mgf_list.append(soft_mgf)

    mgf = np.concatenate(mgf_list)
    log.info("saving features")
    np.save(
        "dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_")),
        mgf)
Example #3
def get_dataset_dataloader(train_keys, test_keys, data_dir, id_to_y,
                           batch_size, num_workers, pos_noise_std):
    from torch.utils.data import DataLoader
    from dataset import MolDataset, tensor_collate_fn
    train_dataset = MolDataset(train_keys,
                               data_dir,
                               id_to_y,
                               pos_noise_std=pos_noise_std)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size,
                                  num_workers=num_workers,
                                  collate_fn=tensor_collate_fn,
                                  shuffle=True)

    test_dataset = MolDataset(test_keys, data_dir, id_to_y)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size,
                                 num_workers=num_workers,
                                 collate_fn=tensor_collate_fn,
                                 shuffle=False)
    return train_dataset, train_dataloader, test_dataset, test_dataloader
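A hypothetical call to the helper above (paths, keys, and the id_to_y mapping are placeholders, not taken from the original source):

train_dataset, train_dataloader, test_dataset, test_dataloader = \
    get_dataset_dataloader(train_keys, test_keys, 'data/', id_to_y,
                           batch_size=32, num_workers=4, pos_noise_std=0.0)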
Example #4
    print(f"No {args.potential} potential")
    exit(-1)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = utils.initialize_model(model, device, args.restart_file)

print(f"vina_hbond_coeff: {model.vina_hbond_coeff.data.cpu().numpy()[0]:.3f}")
print(f"vina_hydrophobic_coeff: \
{model.vina_hydrophobic_coeff.data.cpu().numpy()[0]:.3f}")
print(f"rotor_coeff: {model.rotor_coeff.data.cpu().numpy()[0]:.3f}")
print(f"vdw_coeff: {model.vdw_coeff.data.cpu().numpy()[0]:.3f}")
# exit(-1)
print("number of parameters : ",
      sum(p.numel() for p in model.parameters() if p.requires_grad))

# Dataloader
test_dataset = MolDataset(test_keys, args.data_dir, id_to_y)
test_data_loader = DataLoader(test_dataset,
                              args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=tensor_collate_fn)

# test
st = time.time()

test_losses1 = []
test_losses2 = []

test_pred = dict()
test_true = dict()
Example #5
from rdkit import Chem
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from widis_lstm_tools.preprocessing import random_dataset_split, inds_to_one_hot

from dataset import MolDataset
from model import MoLSTM
from train import train

# device
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# load dataset
filepath = r'results\fifth_submission.txt'
data = MolDataset(filepath)

# data splitting
train_data, test_data = random_dataset_split(data,
                                             split_sizes=(90 / 100., 10 / 100))

# data loader
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=True)

# model
model = MoLSTM(n_inputs=len(data.id2char), hidden_size=128)
model = model.to(device)
criterion = nn.CrossEntropyLoss()

# Optimizer, scheduler
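The snippet is cut off at this comment; a plausible continuation (an assumption, not the original code) would pair Adam with a step-decay scheduler before calling train():

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)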
Example #6
#print simple statistics about dude data and pdbbind data
print(f'Number of train data: {len(train_keys)}')
print(f'Number of test data: {len(test_keys)}')

#initialize model
if args.ngpu > 0:
    cmd = utils.set_cuda_visible_device(args.ngpu)
    os.environ['CUDA_VISIBLE_DEVICES'] = cmd[:-1]
model = gnn(args)
print('number of parameters : ',
      sum(p.numel() for p in model.parameters() if p.requires_grad))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = utils.initialize_model(model, device)

#train and test dataset
train_dataset = MolDataset(train_keys, args.dude_data_fpath)
test_dataset = MolDataset(test_keys, args.dude_data_fpath)
num_train_chembl = len([0 for k in train_keys if 'CHEMBL' in k])
num_train_decoy = len([0 for k in train_keys if 'CHEMBL' not in k])
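# inverse class-frequency weights: actives (CHEMBL) and decoys contribute
# equal total probability mass to the weighted sampler below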
train_weights = [
    1 / num_train_chembl if 'CHEMBL' in k else 1 / num_train_decoy
    for k in train_keys
]
train_sampler = DTISampler(train_weights, len(train_weights), replacement=True)
train_dataloader = DataLoader(train_dataset,
                              args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=collate_fn,
                              sampler=train_sampler)
test_dataloader = DataLoader(test_dataset,
                             args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=collate_fn)

#optimizer
Example #7
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on pcqm4m with PGL')
    parser.add_argument('--use_cuda', action='store_true')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gin-virtual',
        help=
        'GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument(
        '--graph_pooling',
        type=str,
        default='sum',
        help='graph pooling strategy mean or sum (default: sum)')
    parser.add_argument('--drop_ratio',
                        type=float,
                        default=0,
                        help='dropout ratio (default: 0)')
    parser.add_argument(
        '--num_layers',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=600,
        help='dimensionality of hidden units in GNNs (default: 600)')
    parser.add_argument('--train_subset', action='store_true')
    parser.add_argument('--batch_size',
                        type=int,
                        default=256,
                        help='input batch size for training (default: 256)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=1,
                        help='number of workers (default: 1)')
    parser.add_argument('--log_dir',
                        type=str,
                        default="",
                        help='tensorboard log directory')
    parser.add_argument('--checkpoint_dir',
                        type=str,
                        default='',
                        help='directory to save checkpoint')
    parser.add_argument('--save_test_dir',
                        type=str,
                        default='',
                        help='directory to save test submission file')
    args = parser.parse_args()

    print(args)

    random.seed(42)
    np.random.seed(42)
    paddle.seed(42)

    if not args.use_cuda:
        paddle.set_device("cpu")

    ### automatic dataloading and splitting
    class Config():
        def __init__(self):
            self.base_data_path = "./dataset"

    config = Config()
    ds = MolDataset(config)
    split_idx = ds.get_idx_split()
    test_ds = Subset(ds, split_idx['test'])

    print("Test exapmles: ", len(test_ds))

    ### automatic evaluator. takes dataset name as input
    evaluator = PCQM4MEvaluator()

    test_loader = Dataloader(test_ds,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=CollateFn())

    shared_params = {
        'num_layers': args.num_layers,
        'emb_dim': args.emb_dim,
        'drop_ratio': args.drop_ratio,
        'graph_pooling': args.graph_pooling
    }

    if args.gnn == 'gin':
        model = GNN(gnn_type='gin', virtual_node=False, **shared_params)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin', virtual_node=True, **shared_params)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn', virtual_node=False, **shared_params)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn', virtual_node=True, **shared_params)
    else:
        raise ValueError('Invalid GNN type')

    num_params = sum(p.numel() for p in model.parameters())
    print(f'#Params: {num_params}')

    checkpoint_path = os.path.join(args.checkpoint_dir, 'checkpoint.pdparams')
    if not os.path.exists(checkpoint_path):
        raise RuntimeError(f'Checkpoint file not found at {checkpoint_path}')

    model.set_state_dict(paddle.load(checkpoint_path))

    print('Predicting on test data...')
    y_pred = test(model, test_loader)
    print('Saving test submission file...')
    evaluator.save_test_submission({'y_pred': y_pred}, args.save_test_dir)
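The `test` helper called near the end is not part of this excerpt; a minimal sketch, assuming each batch is a (graph, labels) pair and the graph exposes PGL's .tensor() conversion:

@paddle.no_grad()
def test(model, loader):
    model.eval()
    y_pred = []
    for g, labels in loader:        # batch layout is an assumption
        g = g.tensor()              # numpy graph -> paddle tensors
        pred = model(g)
        y_pred.append(pred.numpy().reshape(-1))
    return np.concatenate(y_pred)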
Example #8
if args.ngpu > 0:
    cmd = utils.set_cuda_visible_device(args.ngpu)
    os.environ['CUDA_VISIBLE_DEVICES'] = cmd[:-1]
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

c_to_i = pickle.load(open(args.c_to_i, 'rb'))
i_to_c = pickle.load(open(args.i_to_c, 'rb'))
n_char = len(c_to_i)

dataloaders = []

for fn in args.filenames:
    with open(fn) as f:
        lines = f.readlines()
        lines = [s.strip().split()[1] for s in lines]
        test_dataset = MolDataset(lines, c_to_i)
    test_dataloader = DataLoader(test_dataset,
                                 args.batch_size,
                                 shuffle=True,
                                 num_workers=args.num_workers,
                                 collate_fn=my_collate)
    dataloaders.append(test_dataloader)

model = model.RNN(args.n_feature, args.n_feature, n_char, args.n_layer, i_to_c)

model = utils.initialize_model(model, device, args.save_files)

print("number of parameters :", sum(p.numel() for p in model.parameters() if p.requires_grad))

model.eval()
for fn,dataloader in zip(args.filenames, dataloaders):
    log_likelihoods = []
    for i_batch, sample in enumerate(dataloader) :
        x, l = sample['X'].to(device).long(), sample['L'].long().data.cpu().numpy()
Example #9
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on pcqm4m with PGL')
    parser.add_argument('--use_cuda', action='store_true')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gin-virtual',
        help=
        'GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument(
        '--graph_pooling',
        type=str,
        default='sum',
        help='graph pooling strategy mean or sum (default: sum)')
    parser.add_argument('--drop_ratio',
                        type=float,
                        default=0,
                        help='dropout ratio (default: 0)')
    parser.add_argument(
        '--num_layers',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=600,
        help='dimensionality of hidden units in GNNs (default: 600)')
    parser.add_argument('--train_subset', action='store_true')
    parser.add_argument('--batch_size',
                        type=int,
                        default=256,
                        help='input batch size for training (default: 256)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=1,
                        help='number of workers (default: 1)')
    parser.add_argument('--log_dir',
                        type=str,
                        default="",
                        help='tensorboard log directory')
    parser.add_argument('--checkpoint_dir',
                        type=str,
                        default='',
                        help='directory to save checkpoint')
    parser.add_argument('--save_test_dir',
                        type=str,
                        default='',
                        help='directory to save test submission file')
    args = parser.parse_args()

    print(args)

    random.seed(42)
    np.random.seed(42)
    paddle.seed(42)

    if not args.use_cuda:
        paddle.set_device("cpu")

    ### automatic dataloading and splitting
    class Config():
        def __init__(self):
            self.base_data_path = "./dataset"

    config = Config()
    ds = MolDataset(config)

    split_idx = ds.get_idx_split()
    train_ds = Subset(ds, split_idx['train'])
    valid_ds = Subset(ds, split_idx['valid'])
    test_ds = Subset(ds, split_idx['test'])

    print("Train exapmles: ", len(train_ds))
    print("Valid exapmles: ", len(valid_ds))
    print("Test exapmles: ", len(test_ds))

    ### automatic evaluator. takes dataset name as input
    evaluator = PCQM4MEvaluator()

    train_loader = Dataloader(train_ds,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              collate_fn=CollateFn())

    valid_loader = Dataloader(valid_ds,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=CollateFn())

    if args.save_test_dir != '':
        test_loader = Dataloader(test_ds,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=CollateFn())

    if args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    shared_params = {
        'num_layers': args.num_layers,
        'emb_dim': args.emb_dim,
        'drop_ratio': args.drop_ratio,
        'graph_pooling': args.graph_pooling
    }

    if args.gnn == 'gin':
        model = GNN(gnn_type='gin', virtual_node=False, **shared_params)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin', virtual_node=True, **shared_params)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn', virtual_node=False, **shared_params)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn', virtual_node=True, **shared_params)
    else:
        raise ValueError('Invalid GNN type')

    num_params = sum(p.numel() for p in model.parameters())
    print(f'#Params: {num_params}')

    if args.log_dir != '':
        writer = SummaryWriter(log_dir=args.log_dir)

    best_valid_mae = 1000

    scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.001,
                                              step_size=300,
                                              gamma=0.25)

    optimizer = paddle.optimizer.Adam(learning_rate=scheduler,
                                      parameters=model.parameters())

    msg = "ogbg_lsc_paddle_baseline\n"
    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train_mae = train(model, train_loader, optimizer)

        print('Evaluating...')
        valid_mae = eval(model, valid_loader, evaluator)

        print({'Train': train_mae, 'Validation': valid_mae})

        if args.log_dir != '':
            writer.add_scalar('valid/mae', valid_mae, epoch)
            writer.add_scalar('train/mae', train_mae, epoch)

        if valid_mae < best_valid_mae:
            best_valid_mae = valid_mae
            if args.checkpoint_dir != '':
                print('Saving checkpoint...')
                paddle.save(
                    model.state_dict(),
                    os.path.join(args.checkpoint_dir, 'checkpoint.pdparams'))

            if args.save_test_dir != '':
                print('Predicting on test data...')
                y_pred = test(model, test_loader)
                print('Saving test submission file...')
                evaluator.save_test_submission({'y_pred': y_pred},
                                               args.save_test_dir)

        scheduler.step()

        print(f'Best validation MAE so far: {best_valid_mae}')

        try:
            msg +="Epoch: %d | Train: %.6f | Valid: %.6f | Best Valid: %.6f\n" \
                    % (epoch, train_mae, valid_mae, best_valid_mae)
            print(msg)
        except:
            continue

    if args.log_dir != '':
        writer.close()
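Neither `train` nor `eval` is included in this excerpt; a minimal sketch of the training helper (batch layout and the L1 loss are assumptions; PCQM4M is scored by MAE):

def train(model, loader, optimizer):
    model.train()
    loss_fn = paddle.nn.L1Loss()
    total_mae, n_batches = 0.0, 0
    for g, labels in loader:                     # batch layout is an assumption
        g = g.tensor()
        labels = paddle.to_tensor(labels, dtype="float32")
        pred = model(g).reshape([-1])
        loss = loss_fn(pred, labels)
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        total_mae += float(loss)
        n_batches += 1
    return total_mae / n_batches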
Example #10
def main():
    now = time.localtime()
    s = "%04d-%02d-%02d %02d:%02d:%02d" % (now.tm_year, now.tm_mon,
                                           now.tm_mday, now.tm_hour,
                                           now.tm_min, now.tm_sec)
    print(s)

    parser = argparse.ArgumentParser()

    parser.add_argument("--lr",
                        help="learning rate",
                        type=float,
                        default=0.0001)
    parser.add_argument("--epoch", help="epoch", type=int, default=10000)
    parser.add_argument("--ngpu", help="number of gpu", type=int, default=1)
    parser.add_argument("--batch_size",
                        help="batch_size",
                        type=int,
                        default=32)
    parser.add_argument("--num_workers",
                        help="number of workers",
                        type=int,
                        default=7)
    parser.add_argument("--n_graph_layer",
                        help="number of GNN layer",
                        type=int,
                        default=4)
    parser.add_argument("--d_graph_layer",
                        help="dimension of GNN layer",
                        type=int,
                        default=140)
    parser.add_argument("--n_FC_layer",
                        help="number of FC layer",
                        type=int,
                        default=4)
    parser.add_argument("--d_FC_layer",
                        help="dimension of FC layer",
                        type=int,
                        default=128)
    parser.add_argument("--dude_data_fpath",
                        help="file path of dude data",
                        type=str,
                        default='data/')
    parser.add_argument("--save_dir",
                        help="save directory of model parameter",
                        type=str,
                        default='./save/')
    parser.add_argument("--initial_mu",
                        help="initial value of mu",
                        type=float,
                        default=4.0)
    parser.add_argument("--initial_dev",
                        help="initial value of dev",
                        type=float,
                        default=1.0)
    parser.add_argument("--dropout_rate",
                        help="dropout_rate",
                        type=float,
                        default=0.0)
    parser.add_argument("--train_keys",
                        help="train keys",
                        type=str,
                        default='keys/train_keys.pkl')
    parser.add_argument("--test_keys",
                        help="test keys",
                        type=str,
                        default='keys/test_keys.pkl')
    args = parser.parse_args()
    print(args)

    #hyper parameters
    num_epochs = args.epoch
    lr = args.lr
    ngpu = args.ngpu
    batch_size = args.batch_size
    dude_data_fpath = args.dude_data_fpath
    save_dir = args.save_dir

    #make save dir if it doesn't exist
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
        print('save_dir({}) created'.format(save_dir))
        pass

    print('save_dir:{}'.format(save_dir))
    print('+' * 10)

    #read data. data is stored in format of dictionary. Each key has information about protein-ligand complex.
    with open(args.train_keys, 'rb') as fp:
        train_keys = pickle.load(fp)
        #
        # train_keys: type=list, len=730, ['andr_C36276925', 'dhi1_C08592133', 'hivpr_C59233791', 'hivrt_C66397637', 'cah2_C62892628', ... ]
        #
        print('train_keys({}) loaded from pickle --> type:{}, len:{}, ex:\n{}'.
              format(args.train_keys, type(train_keys), len(train_keys),
                     train_keys[:5]))
        pass

    print('+' * 3)

    with open(args.test_keys, 'rb') as fp:
        test_keys = pickle.load(fp)
        #
        # test_keys: type=list, len=255, ['fnta_C59365794', 'ace_C22923016', 'aces_C21842010', 'kith_C11223989', 'kpcb_C37928874', ... ]
        #
        print('test_keys({}) loaded from pickle --> type:{}, len:{}, ex:\n{}'.
              format(args.test_keys, type(test_keys), len(test_keys),
                     test_keys[:5]))
        pass

    print('+' * 10)

    #print simple statistics about dude data and pdbbind data
    print(f'Number of train data: {len(train_keys)}')
    print(f'Number of test data: {len(test_keys)}')

    if 0 < args.ngpu:
        cmd = utils.set_cuda_visible_device(args.ngpu)
        print('utils.set_cuda_visible_device({}) --> cmd:{}'.format(
            args.ngpu, cmd))
        os.environ['CUDA_VISIBLE_DEVICES'] = cmd[:-1]
        pass

    model = gnn(args)

    print('+' * 10)

    print('number of parameters : ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    device = torch.device(
        "cuda:0" if torch.cuda.is_available() and 0 < args.ngpu else "cpu")

    print('device: {}'.format(device))

    # initialize model
    model = utils.initialize_model(model, device)

    print('#' * 80)

    print('dude_data_fpath:{}'.format(args.dude_data_fpath))

    #train and test dataset
    train_dataset = MolDataset(train_keys, args.dude_data_fpath)
    test_dataset = MolDataset(test_keys, args.dude_data_fpath)

    print('#' * 80)

    num_train_chembl = len([0 for k in train_keys if 'CHEMBL' in k])
    num_train_decoy = len([0 for k in train_keys if 'CHEMBL' not in k])

    print('#1:num_train_chembl:{}, num_train_decoy:{}'.format(
        num_train_chembl, num_train_decoy))

    num_train_chembl = len([0 for k in train_keys if 'CHEMBL' in k])
    num_train_decoy = len(train_keys) - num_train_chembl

    print('#2:num_train_chembl:{}, num_train_decoy:{}'.format(
        num_train_chembl, num_train_decoy))

    #train_weights = [1/num_train_chembl if 'CHEMBL' in k else 1/num_train_decoy for k in train_keys]
    train_weight_chembl = 1.0 / num_train_chembl
    train_weight_decoy = 1.0 / num_train_decoy
    train_weights = [
        train_weight_chembl if 'CHEMBL' in k else train_weight_decoy
        for k in train_keys
    ]

    print('main: sum(train_weights):{}'.format(sum(train_weights)))
    print(
        'train_weight_chembl:{} / train_weight_decoy:{}, len(train_weights):{}'
        .format(train_weight_chembl, train_weight_decoy, len(train_weights)))

    train_sampler = DTISampler(train_weights,
                               len(train_weights),
                               replacement=True)

    print('main: args.batch_size:{}, args.num_workers:{}'.format(
        args.batch_size, args.num_workers))

    #
    # train_dataset: object of MolDataset(torch.utils.data.Dataset)
    #
    train_dataloader = DataLoader(train_dataset,
                                  args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn,
                                  sampler=train_sampler)

    #
    # test_dataset: object of MolDataset(torch.utils.data.Dataset)
    #
    test_dataloader = DataLoader(test_dataset,
                                 args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    #optimizer
    #optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

    #loss function --> CrossEntropyLoss over the two output logits (BCELoss, i.e. binary cross-entropy, is left commented out)
    #loss_fn = nn.BCELoss()
    loss_fn = nn.CrossEntropyLoss()
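    # the GNN outputs two logits per complex; CrossEntropyLoss consumes the raw
    # logits, while softmax(pred)[:, 1] below serves as the score for ROC-AUC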

    test_roc_list = list()
    best_test_roc = 0.0

    for epoch in range(num_epochs):
        st = time.time()
        #collect losses of each iteration
        train_losses = []
        test_losses = []

        #collect true label of each iteration
        train_true = []
        test_true = []

        #collect predicted label of each iteration
        train_pred = []
        test_pred = []

        model.train()  # sets the model in training mode.
        #print('model.training:{}'.format(model.training))

        for i_batch, sample in enumerate(train_dataloader):
            model.zero_grad()
            H, A1, A2, Y, V, keys = sample

            n_queried, n_max_n1, n_max_n2, n_max_adj, n_file_opened = train_dataset.get_n_queried(
            )

            if epoch == 0 and i_batch == 0:
                print('#1:{}/{} H:type:{}, shape:{}\n{}'.format(
                    i_batch, epoch, type(H), H.shape, H))
                print('    A1:type:{}, shape:{}\n{}'.format(
                    type(A1), A1.shape, A1))
                print('    A2:type:{}, shape:{}\n{}'.format(
                    type(A2), A2.shape, A2))
                print('    Y:type:{}, shape:{}\n{}'.format(
                    type(Y), Y.shape, Y))
                print('    V:type:{}, shape:{}\n{}'.format(
                    type(V), V.shape, V))
                print('    keys:type:{}\n{}'.format(type(keys), keys))
                print(
                    '    train_dataset: n_queried:{}, n_max_n1:{}, n_max_n2:{}, n_max_adj:{}, n_file_opened:{}'
                    .format(n_queried, n_max_n1, n_max_n2, n_max_adj,
                            n_file_opened))
                print('+' * 10)
                pass

            H, A1, A2, Y, V = H.to(device), A1.to(device), A2.to(device),\
                                Y.to(device), V.to(device)

            if epoch == 0 and i_batch == 0:
                print('#2:{}/{} H:type:{}, shape:{}\n{}'.format(
                    i_batch, epoch, type(H), H.shape, H))
                print('    A1:type:{}, shape:{}\n{}'.format(
                    type(A1), A1.shape, A1))
                print('    A2:type:{}, shape:{}\n{}'.format(
                    type(A2), A2.shape, A2))
                print('    Y:type:{}, shape:{}\n{}'.format(
                    type(Y), Y.shape, Y))
                print('    V:type:{}, shape:{}\n{}'.format(
                    type(V), V.shape, V))
                print('    keys:type:{}\n{}'.format(type(keys), keys))
                print(
                    '    train_dataset: n_queried:{}, n_max_n1:{}, n_max_n2:{}, n_max_adj:{}, n_file_opened:{}'
                    .format(n_queried, n_max_n1, n_max_n2, n_max_adj,
                            n_file_opened))
                print('+' * 10)
                pass

            #train neural network
            pred = model.train_model((H, A1, A2, V))
            #pred = model.module.train_model((H, A1, A2, V))
            pred = pred.cpu()
            pred_softmax = pred.detach().numpy()
            pred_softmax = softmax(pred_softmax, axis=1)[:, 1]

            if epoch == 0 and i_batch == 0:
                print('{}/{} pred:shape:{}\n{}\nY.shape:{}'.format(
                    i_batch, epoch, pred.shape, pred, Y.shape))
                print('+' * 10)
                print('{}/{} pred_softmax:shape:{}\n{}'.format(
                    i_batch, epoch, pred_softmax.shape, pred_softmax))
                print('+' * 10)
                pass

            loss = loss_fn(pred, Y)

            if epoch == 0 and i_batch == 0:
                print('{}/{} loss:shape:{}\n{}'.format(i_batch, epoch,
                                                       loss.shape, loss))
                print('+' * 10)
                pass

            loss.backward()
            optimizer.step()

            #collect loss, true label and predicted label
            train_losses.append(loss.data.cpu().numpy())
            train_true.append(Y.data.cpu().numpy())
            #train_pred.append(pred.data.cpu().numpy())
            train_pred.append(pred_softmax)
            #if i_batch>10 : break

            pass  # end of for i_batch,sample

        model.eval()  # equivalent with model.train(mode=False)
        for i_batch, sample in enumerate(test_dataloader):
            model.zero_grad()

            H, A1, A2, Y, V, keys = sample
            H, A1, A2, Y, V = H.to(device), A1.to(device), A2.to(device),\
                              Y.to(device), V.to(device)

            #train neural network
            pred = model.train_model((H, A1, A2, V))
            pred_softmax = pred.detach().numpy()
            pred_softmax = softmax(pred_softmax, axis=1)[:, 1]

            loss = loss_fn(pred, Y)

            #collect loss, true label and predicted label
            test_losses.append(loss.data.cpu().numpy())
            test_true.append(Y.data.cpu().numpy())
            #test_pred.append(pred.data.cpu().numpy())
            test_pred.append(pred_softmax)
            #if i_batch>10 : break

            if epoch == 0 and i_batch == 0:
                print('eval: Y.shape:{}, pred.shape:{}, pred_softmax.shape:{}'.
                      format(Y.shape, pred.shape, pred_softmax.shape))
                pass
            pass

        train_losses = np.mean(np.array(train_losses))
        test_losses = np.mean(np.array(test_losses))

        train_pred = np.concatenate(train_pred, 0)
        test_pred = np.concatenate(test_pred, 0)

        train_true = np.concatenate(train_true, 0)
        test_true = np.concatenate(test_true, 0)

        #print('#' * 80)
        #print('train_pred:\n{}'.format(train_pred))
        #print('+' * 7)
        ##print(softmax(train_pred, axis=1))

        #print('+' * 10)
        #print('+' * 10)

        #print('train_true:\n{}'.format(train_true))
        #print('#' * 80, flush=True)

        train_roc = roc_auc_score(train_true, train_pred)
        test_roc = roc_auc_score(test_true, test_pred)

        end = time.time()
        if epoch == 0:
            print(
                'epoch\ttrain_losses\ttest_losses\ttrain_roc\ttest_roc\telapsed_time'
            )
            pass
        #print('#' * 80)
        #print ('epoch\ttrain_losses\ttest_losses\ttrain_roc\ttest_roc\telapsed_time')
        #print ("%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f" \
        print ('%s\t%.6f\t%.6f\t%.6f\t%.6f\t%.6f\t%s' \
               % (epoch, train_losses, test_losses, train_roc, test_roc, end-st, datetime.datetime.fromtimestamp(end).strftime('%Y-%m-%d %H:%M:%S.%f')),
               end='')

        #name = save_dir + '/save_'+str(epoch)+'.pt'
        #torch.save(model.state_dict(), name)
        if best_test_roc < test_roc:
            name = save_dir + '/save_' + str(epoch) + '.pt'
            torch.save(model.state_dict(), name)
            print(' updated')

            best_test_roc = test_roc
            pass
        else:
            print('')
            pass

        test_roc_list.append(test_roc)
        pass
    pass
Example #11
def main(config):
    if dist.get_world_size() > 1:
        dist.init_parallel_env()

    if dist.get_rank() == 0:
        timestamp = datetime.now().strftime("%Hh%Mm%Ss")
        log_path = os.path.join(config.log_dir,
                                "tensorboard_log_%s" % timestamp)
        writer = SummaryWriter(log_path)

    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=config.dataset_name)
    config.num_class = raw_dataset.num_tasks
    config.eval_metric = raw_dataset.eval_metric
    config.task_type = raw_dataset.task_type

    mol_dataset = MolDataset(config,
                             raw_dataset,
                             transform=make_multihop_edges)
    splitted_index = raw_dataset.get_idx_split()
    train_ds = Subset(mol_dataset, splitted_index['train'], mode='train')
    valid_ds = Subset(mol_dataset, splitted_index['valid'], mode="valid")
    test_ds = Subset(mol_dataset, splitted_index['test'], mode="test")

    log.info("Train Examples: %s" % len(train_ds))
    log.info("Val Examples: %s" % len(valid_ds))
    log.info("Test Examples: %s" % len(test_ds))

    fn = CollateFn(config)

    train_loader = Dataloader(train_ds,
                              batch_size=config.batch_size,
                              shuffle=True,
                              num_workers=config.num_workers,
                              collate_fn=fn)

    valid_loader = Dataloader(valid_ds,
                              batch_size=config.batch_size,
                              num_workers=config.num_workers,
                              collate_fn=fn)

    test_loader = Dataloader(test_ds,
                             batch_size=config.batch_size,
                             num_workers=config.num_workers,
                             collate_fn=fn)

    model = ClassifierNetwork(config.hidden_size, config.out_dim,
                              config.num_layers, config.dropout_prob,
                              config.virt_node, config.K, config.conv_type,
                              config.appnp_hop, config.alpha)
    model = paddle.DataParallel(model)

    optim = Adam(learning_rate=config.lr, parameters=model.parameters())
    criterion = nn.loss.BCEWithLogitsLoss()

    evaluator = Evaluator(config.dataset_name)

    best_valid = 0

    global_step = 0
    for epoch in range(1, config.epochs + 1):
        model.train()
        for idx, batch_data in enumerate(train_loader):
            g, mh_graphs, labels, unmask = batch_data
            g = g.tensor()
            multihop_graphs = []
            for item in mh_graphs:
                multihop_graphs.append(item.tensor())
            g.multi_hop_graphs = multihop_graphs
            labels = paddle.to_tensor(labels)
            unmask = paddle.to_tensor(unmask)

            pred = model(g)
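            # masked_select keeps only the labelled task entries flagged by `unmask`,
            # dropping unlabelled positions from predictions and labels before the loss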
            pred = paddle.masked_select(pred, unmask)
            labels = paddle.masked_select(labels, unmask)
            train_loss = criterion(pred, labels)
            train_loss.backward()
            optim.step()
            optim.clear_grad()

            if global_step % 80 == 0:
                message = "train: epoch %d | step %d | " % (epoch, global_step)
                message += "loss %.6f" % (train_loss.numpy())
                log.info(message)
                if dist.get_rank() == 0:
                    writer.add_scalar("loss", train_loss.numpy(), global_step)
            global_step += 1

        valid_result = evaluate(model, valid_loader, criterion, evaluator)
        message = "valid: epoch %d | step %d | " % (epoch, global_step)
        for key, value in valid_result.items():
            message += " | %s %.6f" % (key, value)
            if dist.get_rank() == 0:
                writer.add_scalar("valid_%s" % key, value, global_step)
        log.info(message)

        test_result = evaluate(model, test_loader, criterion, evaluator)
        message = "test: epoch %d | step %d | " % (epoch, global_step)
        for key, value in test_result.items():
            message += " | %s %.6f" % (key, value)
            if dist.get_rank() == 0:
                writer.add_scalar("test_%s" % key, value, global_step)
        log.info(message)

        if best_valid < valid_result[config.metrics]:
            best_valid = valid_result[config.metrics]
            best_valid_result = valid_result
            best_test_result = test_result

        message = "best result: epoch %d | " % (epoch)
        message += "valid %s: %.6f | " % (config.metrics,
                                          best_valid_result[config.metrics])
        message += "test %s: %.6f | " % (config.metrics,
                                         best_test_result[config.metrics])
        log.info(message)

    message = "final eval best result:%.6f" % best_valid_result[config.metrics]
    log.info(message)
    message = "final test best result:%.6f" % best_test_result[config.metrics]
    log.info(message)
Example #12
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

save_dir = args.save_dir

if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

c_to_i = pickle.load(open(args.c_to_i, 'rb'))
i_to_c = pickle.load(open(args.i_to_c, 'rb'))
n_char = len(c_to_i)

print('c_to_i:', c_to_i)
with open(args.train_filenames) as f:
    lines = f.readlines()
    train_lines = [s.strip().split()[1] for s in lines]
train_dataset = MolDataset(train_lines, dict(c_to_i), args.enumerate_smiles,
                           args.stereo)

with open(args.test_filenames) as f:
    lines = f.readlines()
    test_lines = [s.strip().split()[1] for s in lines]
test_dataset = MolDataset(test_lines, dict(c_to_i), args.enumerate_smiles,
                          args.stereo)

train_dataloader = DataLoader(train_dataset,
                              args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              collate_fn=my_collate)
test_dataloader = DataLoader(test_dataset,
                             args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=my_collate)  # assumed continuation: mirrors train_dataloader (snippet truncated here)
Example #13
def train(args, pretrained_model_config=None):
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    train_ds = MolDataset(args, raw_dataset)

    args.eval_steps = math.ceil(len(train_ds) / args.batch_size)
    log.info("Total %s steps (eval_steps) every epoch." % (args.eval_steps))

    fn = MgfCollateFn(args)

    train_loader = Dataloader(train_ds,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers,
                              shuffle=args.shuffle,
                              stream_shuffle_size=args.shuffle_size,
                              collate_fn=fn)

    # for evaluating
    eval_train_loader = train_loader
    eval_train_loader = PDataset.from_generator_func(eval_train_loader)

    train_loader = multi_epoch_dataloader(train_loader, args.epochs)
    train_loader = PDataset.from_generator_func(train_loader)

    if args.warm_start_from is not None:
        # warm start setting
        def _fn(v):
            if not isinstance(v, F.framework.Parameter):
                return False
            if os.path.exists(os.path.join(args.warm_start_from, v.name)):
                return True
            else:
                return False

        ws = propeller.WarmStartSetting(predicate_fn=_fn,
                                        from_dir=args.warm_start_from)
    else:
        ws = None

    def cmp_fn(old, new):
        if old['eval'][args.metrics] - new['eval'][args.metrics] > 0:
            log.info("best %s eval result: %s" % (args.metrics, new['eval']))
            return True
        else:
            return False

    if args.log_id is not None:
        save_best_model = int(args.log_id) == 5
    else:
        save_best_model = True
    best_exporter = propeller.exporter.BestResultExporter(
        args.output_dir, (cmp_fn, save_best_model))

    eval_datasets = {"eval": eval_train_loader}

    propeller.train.train_and_eval(
        model_class_or_model_fn=MgfModel,
        params=pretrained_model_config,
        run_config=args,
        train_dataset=train_loader,
        eval_dataset=eval_datasets,
        warm_start_setting=ws,
        exporters=[best_exporter],
    )