Example #1
    def __FeaturizerSimple(self, mols) -> list:
        atom_featurizer = BaseAtomFeaturizer({
            "n_feat": ConcatFeaturizer([
                    # partial(atom_type_one_hot,
                    #        allowable_set=['C', 'N', 'O', 'F', 'Si', 'S'],
                    #        encode_unknown=True),
                    # partial(atom_degree_one_hot, allowable_set=list(range(6))),
                    atom_is_aromatic,
                    atom_formal_charge,
                    atom_num_radical_electrons,
                    partial(atom_hybridization_one_hot, encode_unknown=True),
                    lambda atom: [0],  # placeholder for aromatic information
                    atom_total_num_H_one_hot,
                ])
        })
        bond_featurizer = BaseBondFeaturizer(
            {"e_feat": ConcatFeaturizer([bond_type_one_hot, bond_is_in_ring])})

        train_graph = [
            mol_to_bigraph(mol,
                           node_featurizer=atom_featurizer,
                           edge_featurizer=bond_featurizer) for mol in mols
        ]
        return train_graph
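These examples all rely on the DGL-LifeSci featurizer utilities, which the snippets assume are already imported. Below is a sketch of the imports this method needs, plus a quick smoke test with the same featurizers (RDKit and dgllife assumed installed; the feature tensors land in ndata/edata under the keys given to the featurizers):

from functools import partial

from rdkit import Chem
from dgllife.utils import (BaseAtomFeaturizer, BaseBondFeaturizer,
                           ConcatFeaturizer, mol_to_bigraph, atom_is_aromatic,
                           atom_formal_charge, atom_num_radical_electrons,
                           atom_hybridization_one_hot,
                           atom_total_num_H_one_hot, bond_type_one_hot,
                           bond_is_in_ring)

# Smoke test: featurize ethanol and benzene with the featurizers above.
atom_featurizer = BaseAtomFeaturizer({
    "n_feat": ConcatFeaturizer([atom_is_aromatic, atom_formal_charge,
                                atom_num_radical_electrons,
                                partial(atom_hybridization_one_hot,
                                        encode_unknown=True),
                                atom_total_num_H_one_hot])
})
bond_featurizer = BaseBondFeaturizer(
    {"e_feat": ConcatFeaturizer([bond_type_one_hot, bond_is_in_ring])})
mols = [Chem.MolFromSmiles(s) for s in ("CCO", "c1ccccc1")]
graphs = [mol_to_bigraph(m, node_featurizer=atom_featurizer,
                         edge_featurizer=bond_featurizer) for m in mols]
print(graphs[0].ndata["n_feat"].shape)  # (num_atoms, feat_dim)
print(graphs[0].edata["e_feat"].shape)  # (num_edges, feat_dim); each bond yields two directed edges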
Example #2
    def __Featurizer(self, train_mols) -> list:
        atom_featurizer = BaseAtomFeaturizer({
            "n_feat": ConcatFeaturizer([
                partial(
                    atom_type_one_hot,
                    allowable_set=["C", "N", "O", "F", "Si", "P", "S"],
                    encode_unknown=True,
                ),
                partial(atom_degree_one_hot, allowable_set=list(range(6))),
                atom_is_aromatic,
                atom_formal_charge,
                atom_num_radical_electrons,
                partial(atom_hybridization_one_hot, encode_unknown=True),
                atom_implicit_valence,
                lambda atom: [0],  # placeholder for aromatic information
                atom_total_num_H_one_hot,
            ])
        })
        bond_featurizer = BaseBondFeaturizer(
            {"e_feat": ConcatFeaturizer([bond_type_one_hot, bond_is_in_ring])})
        afp_train_graph = [
            mol_to_bigraph(mol,
                           node_featurizer=atom_featurizer,
                           edge_featurizer=bond_featurizer)
            for mol in tqdm(train_mols)
        ]
        return afp_train_graph
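The afp_ prefix suggests these graphs feed an AttentiveFP model (this snippet additionally assumes from tqdm import tqdm plus the same dgllife imports as in Example #1). Either way, the model's input sizes can be read off the featurizers instead of being hard-coded; feat_size is part of the real dgllife featurizer API:

# Read input dimensions off the featurizers rather than hard-coding them.
n_feats = atom_featurizer.feat_size("n_feat")  # node feature length
e_feats = bond_featurizer.feat_size("e_feat")  # edge feature length
# e.g. dgllife.model.AttentiveFPPredictor(node_feat_size=n_feats,
#                                         edge_feat_size=e_feats, n_tasks=1)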
Example #3
    'lower',
    'smiles_to_graph': smiles_to_bigraph,
    # Follow the atom featurization in the original work
    'node_featurizer': BaseAtomFeaturizer(
        featurizer_funcs={
            'hv': ConcatFeaturizer([
                    partial(atom_type_one_hot,
                            allowable_set=[
                                'B', 'C', 'N', 'O', 'F', 'Si', 'P', 'S', 'Cl',
                                'As', 'Se', 'Br', 'Te', 'I', 'At'
                            ],
                            encode_unknown=True),
                    partial(atom_degree_one_hot, allowable_set=list(range(6))),
                    atom_formal_charge,
                    atom_num_radical_electrons,
                    partial(atom_hybridization_one_hot, encode_unknown=True),
                    lambda atom: [0],  # placeholder for aromatic information
                    atom_total_num_H_one_hot,
                    chirality,  # user-defined featurizer, defined elsewhere in the script
                ])
        }),
    'edge_featurizer': BaseBondFeaturizer(
        {'he': lambda bond: [0 for _ in range(10)]})
}

experiment_configures = {'AttentiveFP_Aromaticity': attentivefp}
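How such a configure dict gets consumed is not shown here; a minimal sketch of applying it to one SMILES string (smiles_to_bigraph and its node_featurizer/edge_featurizer keyword arguments are the real dgllife API; the cfg lookup is illustrative):

cfg = experiment_configures['AttentiveFP_Aromaticity']
g = cfg['smiles_to_graph']('c1ccccc1O',  # phenol
                           node_featurizer=cfg['node_featurizer'],
                           edge_featurizer=cfg['edge_featurizer'])
print(g.ndata['hv'].shape)  # node features stored under the 'hv' key
print(g.edata['he'].shape)  # all-zero 10-dim edge placeholder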
Example #4
atom_featurizer = BaseAtomFeaturizer({
    'hv': ConcatFeaturizer([
        partial(atom_type_one_hot,
                allowable_set=[
                    'C', 'Br', 'N', 'O', 'Cl', 'F', 'P', 'S', 'I', 'Sn',
                    'Se', 'Si', 'Ag', 'Au', 'Ni', 'Zn', 'Mg', 'Co', 'Fe',
                    'Mn', 'Cu', 'B', 'Sb'  # extra atom types, including metals
                ],
                encode_unknown=True),
        partial(atom_degree_one_hot, allowable_set=list(range(6))),
        atom_formal_charge,
        atom_num_radical_electrons,
        partial(atom_hybridization_one_hot, encode_unknown=True),
        lambda atom: [0],  # placeholder for aromatic information
        atom_total_num_H_one_hot,
        chirality  # user-defined featurizer, defined elsewhere
    ])
})
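Note that encode_unknown=True appends one extra slot for out-of-vocabulary atoms, so the one-hot length is len(allowable_set) + 1. A quick check (assuming RDKit):

from rdkit import Chem
from dgllife.utils import atom_type_one_hot

atom = Chem.MolFromSmiles('[Pb]').GetAtomWithIdx(0)  # Pb is not in the set
enc = atom_type_one_hot(atom, allowable_set=['C', 'N', 'O'],
                        encode_unknown=True)
print(enc)  # [False, False, False, True] -- the last slot flags "unknown"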
Example #5
from functools import partial

from rdkit import Chem
from dgllife.utils import (BaseAtomFeaturizer, ConcatFeaturizer,
                           atom_chiral_tag_one_hot, atom_degree_one_hot,
                           atom_formal_charge_one_hot,
                           atom_hybridization_one_hot,
                           atom_implicit_valence_one_hot,
                           atom_is_aromatic_one_hot, atom_mass,
                           atom_type_one_hot)

atom_featurizer = BaseAtomFeaturizer(
    featurizer_funcs={
        'hv': ConcatFeaturizer([
            partial(atom_degree_one_hot, allowable_set=[1, 2, 3, 4, 6]),
            partial(atom_type_one_hot,
                    allowable_set=[
                        'B', 'Br', 'C', 'Cl', 'F', 'H', 'I', 'N', 'O', 'P',
                        'S', 'Se', 'Si'
                    ]),
            atom_chiral_tag_one_hot,
            partial(atom_formal_charge_one_hot, allowable_set=[-1, 0, 1]),
            partial(atom_hybridization_one_hot,
                    allowable_set=[
                        Chem.rdchem.HybridizationType.S,
                        Chem.rdchem.HybridizationType.SP,
                        Chem.rdchem.HybridizationType.SP2,
                        Chem.rdchem.HybridizationType.SP3,
                        Chem.rdchem.HybridizationType.SP3D2,
                    ]),
            partial(atom_implicit_valence_one_hot,
                    allowable_set=list(range(4))),
            atom_is_aromatic_one_hot,
            atom_mass,
        ])
    })

if __name__ == '__main__':
    import pandas as pd
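    # The snippet is truncated here. Purely as an illustration of where it
    # seems headed (the file name and 'smiles' column below are assumptions),
    # a continuation might read SMILES with pandas and featurize them:
    df = pd.read_csv('molecules.csv')  # hypothetical input file
    mols = [Chem.MolFromSmiles(s) for s in df['smiles']]
    feats = [atom_featurizer(m)['hv'] for m in mols if m is not None]
    print(feats[0].shape)  # (num_atoms, feature length)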
Example #6
# Imports this script relies on (not shown in the original snippet);
# graph_dataset, collate, train, validate, predict and save_checkpoint are
# project-local helpers assumed to be defined elsewhere in the repo.
import csv
import pickle
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
from sklearn.model_selection import KFold, train_test_split
from torch.utils.data.sampler import SubsetRandomSampler
from dgllife.utils import EarlyStopping  # matches the mode/patience/filename usage below


def main(args):
    
    # fix random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    
    # load CSV dataset
    smlstr = []
    logCMC = []
    with open("../data/dataset.csv") as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        for row in csvReader:
            smlstr.append(row[0])
            logCMC.append(row[1])
    smlstr = np.asarray(smlstr)
    logCMC = np.asarray(logCMC, dtype="float")
    dataset_size = len(smlstr)
    all_ind = np.arange(dataset_size)

    # split into training and testing
    if args.randSplit:
        train_full_ind, test_ind, \
        smlstr_train, smlstr_test, \
        logCMC_train, logCMC_test = train_test_split(all_ind, smlstr, logCMC,
                                                     test_size=args.test_size,
                                                     random_state=args.seed)   
    else:
        if args.dataset in ["nonionic", "all"]:
            if args.dataset == "nonionic":
                # held-out indices for nonionic surfactants only
                test_ind = np.array([8, 14, 26, 31, 43, 54, 57, 68, 72, 80, 99, 110])
            elif args.dataset == "all":
                test_ind = np.array([8, 14, 26, 31, 43, 54, 57, 68, 72, 80, 99, 110,
                                     125, 132, 140, 150, 164, 171, 178, 185, 192, 197])
            train_full_ind = np.asarray([x for x in all_ind if x not in test_ind])
            np.random.shuffle(test_ind)
            np.random.shuffle(train_full_ind)
            smlstr_train  = smlstr[train_full_ind]
            smlstr_test = smlstr[test_ind]
            logCMC_train = logCMC[train_full_ind]
            logCMC_test = logCMC[test_ind]
        else:
            print("Unrecognized dataset '{}'; falling back to random splits".format(args.dataset))
            args.randSplit = True
            train_full_ind, test_ind, \
            smlstr_train, smlstr_test, \
            logCMC_train, logCMC_test = train_test_split(all_ind, smlstr, logCMC,
                                                         test_size=args.test_size,
                                                         random_state=args.seed)   

        
    # save train/test data and index corresponding to the original dataset
    pickle.dump(smlstr_train,open("../gnn_logs/smlstr_train.p","wb"))
    pickle.dump(smlstr_test,open("../gnn_logs/smlstr_test.p","wb"))
    pickle.dump(logCMC_train,open("../gnn_logs/logCMC_train.p","wb"))
    pickle.dump(logCMC_test,open("../gnn_logs/logCMC_test.p","wb"))
    pickle.dump(train_full_ind,open("../gnn_logs/original_ind_train_full.p","wb"))
    pickle.dump(test_ind,open("../gnn_logs/original_ind_test.p","wb"))
    rows = zip(train_full_ind,smlstr_train,logCMC_train)
    with open("../gnn_logs/dataset_train.csv",'w',newline='') as f:
        writer = csv.writer(f,delimiter=',')
        for row in rows:
            writer.writerow(row)
    rows = zip(test_ind,smlstr_test,logCMC_test)
    with open("../gnn_logs/dataset_test.csv",'w',newline='') as f:
        writer = csv.writer(f,delimiter=',')
        for row in rows:
            writer.writerow(row)
      
    train_size = len(smlstr_train)
    indices = list(range(train_size))
    
    if not args.skip_cv:
        # K-fold CV setup
        kf = KFold(n_splits=args.cv, random_state=args.seed, shuffle=True)
        cv_index = 0
        index_list_train = []
        index_list_valid = []
        for train_indices, valid_indices in kf.split(indices):
            index_list_train.append(train_indices)
            index_list_valid.append(valid_indices)
            # Build the datasets first so args.dim_input is already set to 1
            # before the model is instantiated (in the original ordering the
            # model was created with a stale input dimension on the first
            # fold when --single_feat was used).
            if args.single_feat:
                from dgllife.utils import BaseAtomFeaturizer, atomic_number
                node_enc = BaseAtomFeaturizer({'h': atomic_number})
                train_full_dataset = graph_dataset(smlstr_train, logCMC_train, node_enc=node_enc)
                test_dataset = graph_dataset(smlstr_test, logCMC_test, node_enc=node_enc)
                args.dim_input = 1
            else:
                train_full_dataset = graph_dataset(smlstr_train, logCMC_train)
                test_dataset = graph_dataset(smlstr_test, logCMC_test)

            model = args.gnn_model(args.dim_input, args.unit_per_layer, 1, False)
            model_arch = 'GCNReg'
            loss_fn = nn.MSELoss()

            # check GPU availability
            if args.gpu >= 0:
                model = model.cuda(args.gpu)
                loss_fn = loss_fn.cuda(args.gpu)
                cudnn.enabled = True
                cudnn.benchmark = True
                cudnn.deterministic = False
            optimizer = torch.optim.Adam(model.parameters(), args.lr)

            # training
            train_sampler = SubsetRandomSampler(train_indices)
            valid_sampler = SubsetRandomSampler(valid_indices)
            train_loader = torch.utils.data.DataLoader(train_full_dataset, batch_size=args.batch_size,
                                                       sampler=train_sampler,
                                                       collate_fn=collate,
                                                       shuffle=False)
            val_loader = torch.utils.data.DataLoader(train_full_dataset, batch_size=args.batch_size,
                                                     sampler=valid_sampler,
                                                     collate_fn=collate,
                                                     shuffle=False)
            train_dataset = graph_dataset(smlstr_train[train_indices],logCMC_train[train_indices])
            valid_dataset = graph_dataset(smlstr_train[valid_indices],logCMC_train[valid_indices])
    
            fname = r"ep{}bs{}lr{}kf{}hu{}cvid{}".format(args.epochs, args.batch_size,
                                                               args.lr,
                                                               args.cv,
                                                               args.unit_per_layer, cv_index)
            
            best_rmse = float('inf')  # track the best validation RMSE seen so far
            if args.train:
                print("Training the model ...")
                stopper = EarlyStopping(mode='lower', patience=args.patience, filename=r'../gnn_logs/{}es.pth.tar'.format(fname)) # early stop model
                for epoch in range(args.start_epoch, args.epochs):
                    train_loss = train(train_loader, model, loss_fn, optimizer, epoch, args, fname)
                    rmse = validate(val_loader, model, epoch, args, fname)
                    is_best = rmse < best_rmse
                    best_rmse = min(rmse, best_rmse)
                    if is_best:
                        save_checkpoint({
                            'epoch': epoch + 1,
                            'model_arch': model_arch,
                            'state_dict': model.state_dict(),
                            'best_rmse': best_rmse,
                            'optimizer': optimizer.state_dict(),
                        }, fname)
                    if args.early_stop:
                        early_stop = stopper.step(train_loss, model)
                        if early_stop:
                            print("**********Early Stopping!")
                            break
    
    
            # test
            print("Testing the model ...")
            checkpoint = torch.load(r"../gnn_logs/{}.pth.tar".format(fname))
            args.start_epoch = 0
            best_rmse = checkpoint['best_rmse']
            model = args.gnn_model(args.dim_input, args.unit_per_layer,1,True)
            if args.gpu >= 0:
                model = model.cuda(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            # if args.gpu < 0:
            #     model = model.cpu()
            # else:
            #     model = model.cuda(args.gpu)
            print("=> loaded checkpoint '{}' (epoch {}, rmse {})"
                  .format(fname, checkpoint['epoch'], best_rmse))
            cudnn.deterministic = True
            stage = 'testtest'
            predict(test_dataset, model, -1, args, fname, stage)
            stage = 'testtrain'
            predict(train_dataset, model, -1, args, fname, stage)
            stage = 'testval'
            predict(valid_dataset, model, -1, args, fname, stage)
            cv_index += 1
        pickle.dump(index_list_train,open("../gnn_logs/ind_train_list.p","wb"))
        pickle.dump(index_list_valid,open("../gnn_logs/ind_val_list.p","wb"))

    else:
        # As in the CV branch: build the datasets first so args.dim_input is
        # correct before the model is instantiated.
        if args.single_feat:
            from dgllife.utils import BaseAtomFeaturizer, atomic_number
            node_enc = BaseAtomFeaturizer({'h': atomic_number})
            train_full_dataset = graph_dataset(smlstr_train, logCMC_train, node_enc=node_enc)
            test_dataset = graph_dataset(smlstr_test, logCMC_test, node_enc=node_enc)
            args.dim_input = 1
        else:
            train_full_dataset = graph_dataset(smlstr_train, logCMC_train)
            test_dataset = graph_dataset(smlstr_test, logCMC_test)

        model = args.gnn_model(args.dim_input, args.unit_per_layer, 1, False)
        model_arch = 'GCNReg'
        loss_fn = nn.MSELoss()

        # check GPU availability
        if args.gpu >= 0:
            model = model.cuda(args.gpu)
            loss_fn = loss_fn.cuda(args.gpu)
            cudnn.enabled = True
            cudnn.benchmark = True
            cudnn.deterministic = False
        optimizer = torch.optim.Adam(model.parameters(), args.lr)

        # training
        train_loader = torch.utils.data.DataLoader(train_full_dataset, batch_size=args.batch_size,
                                                   collate_fn=collate,
                                                   shuffle=False)
        test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size,
                                                  collate_fn=collate,
                                                  shuffle=False)
        train_dataset = graph_dataset(smlstr_train,logCMC_train)
        fname = r"ep{}bs{}lr{}hu{}".format(args.epochs, args.batch_size,
                                                           args.lr,
                                                           args.unit_per_layer)
        
        best_rmse = float('inf')  # track the best validation RMSE seen so far
        if args.train:
            print("Training the model ...")
            stopper = EarlyStopping(mode='lower', patience=args.patience, filename=r'../gnn_logs/{}es.pth.tar'.format(fname)) # early stop model
            for epoch in range(args.start_epoch, args.epochs):
                train_loss = train(train_loader, model, loss_fn, optimizer, epoch, args, fname)
                rmse = validate(test_loader, model, epoch, args, fname)
                is_best = rmse < best_rmse
                best_rmse = min(rmse, best_rmse)
                if is_best:
                    save_checkpoint({
                        'epoch': epoch + 1,
                        'model_arch': model_arch,
                        'state_dict': model.state_dict(),
                        'best_rmse': best_rmse,
                        'optimizer': optimizer.state_dict(),
                    }, fname)
                if args.early_stop:
                    early_stop = stopper.step(train_loss, model)
                    if early_stop:
                        print("**********Early Stopping!")
                        break


        # test
        print("Testing the model ...")
        checkpoint = torch.load(r"../gnn_logs/{}.pth.tar".format(fname))
        args.start_epoch = 0
        best_rmse = checkpoint['best_rmse']
        model = args.gnn_model(args.dim_input, args.unit_per_layer,1,True)
        if args.gpu >= 0:
            model = model.cuda(args.gpu)
        model.load_state_dict(checkpoint['state_dict'])
        # if args.gpu < 0:
        #     model = model.cpu()
        # else:
        #     model = model.cuda(args.gpu)
        print("=> loaded checkpoint '{}' (epoch {}, rmse {})"
              .format(fname, checkpoint['epoch'], best_rmse))
        cudnn.deterministic = True
        stage = 'testtest'
        predict(test_dataset, model, -1, args, fname, stage)
        stage = 'testtrain'
        predict(train_dataset, model, -1, args, fname, stage)
        if args.early_stop:
            checkpoint = torch.load(r"../gnn_logs/{}es.pth.tar".format(fname))
            args.start_epoch = 0
            model = args.gnn_model(args.dim_input, args.unit_per_layer,1,True)
            if args.gpu >= 0:
                model = model.cuda(args.gpu)
            model.load_state_dict(checkpoint['model_state_dict'])
            train_dataset = graph_dataset(smlstr_train,logCMC_train)
            test_dataset = graph_dataset(smlstr_test,logCMC_test)
            cudnn.deterministic = True     
            stage = 'testtest'
            predict(test_dataset, model, -1, args, r"{}es".format(fname), stage)
            stage = 'testtrain'
            predict(train_dataset, model, -1, args, r"{}es".format(fname), stage)

        
    return
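For orientation, collate and graph_dataset are project-local and not shown in this snippet. A minimal collate for (DGLGraph, label) pairs, which is what the DataLoader calls above imply, typically looks like this (a sketch; dgl.batch and torch.tensor are the real APIs, everything else is an assumption about the local code):

import dgl
import torch

def collate(samples):
    # Merge a list of (graph, label) pairs into one batched graph
    # plus a float tensor of labels, as DGL models expect.
    graphs, labels = map(list, zip(*samples))
    return dgl.batch(graphs), torch.tensor(labels, dtype=torch.float32)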