Example #1
def main():
    print('\nTRAIN START!!!')
    with Timer() as t2:
        train_set, valid_set = pickle_out(start=0, amount=1, random_state=None)
        # print(train_set[5].shape)
        # print(valid_set[5].shape)
        print('train:', Counter(train_set[5].view(-1).cpu().numpy().tolist()))
        print('valid:', Counter(valid_set[5].view(-1).cpu().numpy().tolist()))
        train_set, valid_set = AllData_pk(train_set), AllData_pk(valid_set)
        print('pickle:')

    with Timer() as t3:
        print(len(train_set), len(valid_set))
        train_loader = DataLoader(train_set,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  drop_last=True)
        valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE)
        print('data load:')

    with Timer() as t4:
        # net = GraphConvAutoEncoder(hid_dim_m=128, hid_dim_p=128, n_class=2)
        # net = net.fit(train_loader, epochs=100)
        net = QSAR(hid_dim_m=216, hid_dim_p=512, n_class=2)
        net = net.fit(train_loader,
                      valid_loader,
                      epochs=N_EPOCH,
                      path='output/gcnn')
        print('model:')
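
Example #1 (and several of the examples below) relies on two recurring idioms: counting class labels by flattening a tensor into a Counter, and batching with DataLoader(..., shuffle=True, drop_last=True). Below is a minimal, self-contained sketch of both on dummy tensors; the BATCH_SIZE value and the tensor shapes are illustrative choices, not taken from the project.

from collections import Counter

import torch
from torch.utils.data import DataLoader, TensorDataset

BATCH_SIZE = 32  # illustrative value, not taken from the example above

# dummy features and binary labels standing in for the pickled tensors
features = torch.randn(1000, 16)
labels = torch.randint(0, 2, (1000, 1)).float()

# same idiom as above: flatten the label tensor and count the classes
print('train:', Counter(labels.view(-1).cpu().numpy().tolist()))

dataset = TensorDataset(features, labels)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
for xb, yb in loader:
    pass  # every batch has exactly BATCH_SIZE rows because drop_last=True
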
Example #2
def main():
    with Timer() as t1:
        data_path = '/home/ubuntu/wangzhongxu/gcnn2/NGFP/dataset'
        pd_filename = 'pd_test.txt'
        pd_lst = data_parser(data_path, pd_filename)
        tmp_lst = [0 for _ in pd_lst]
        print('data parse:')

    with Timer() as t2:
        train_set, valid_set, _, _ = train_test_split(pd_lst,
                                                      tmp_lst,
                                                      test_size=0.2,
                                                      random_state=0)
        train_set = AllData(train_set, data_path)
        valid_set = AllData(valid_set, data_path)
        print('tensorize:')

    with Timer() as t3:
        print(len(train_set), len(valid_set))
        train_loader = DataLoader(train_set,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  drop_last=True)
        valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE)
        print('data load:')

    with Timer() as t4:
        # net = GraphConvAutoEncoder(hid_dim_m=128, hid_dim_p=128, n_class=2)
        # net = net.fit(train_loader, epochs=100)
        net = QSAR(hid_dim_m=216, hid_dim_p=512, n_class=2)
        net = net.fit(train_loader,
                      valid_loader,
                      epochs=N_EPOCH,
                      path='output/gcnn')
        print('model:')
Example #3
def try_load_net(model_file=None):
    if model_file is not None:
        model_file = Path(model_file)
        if model_file.exists() and model_file.is_file():
            net = torch.load(model_file, map_location=dev)
        else:
            raise FileNotFoundError(model_file)
    else:  # random large weights
        net = QSAR(hid_dim=128, n_class=1, max_degree=6)
        enlarge_weights(net, -1e4, 1e4)
    return net.to(dev)
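
Example #3 follows a load-or-fallback pattern: load a serialised model if a valid path is given, otherwise build a fresh network and blow its weights up to large random values. Below is a self-contained sketch of the same pattern, with a plain torch.nn stack standing in for QSAR and a uniform re-initialisation standing in for enlarge_weights (both are project-specific).

from pathlib import Path

import torch
from torch import nn

dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def load_or_random(model_file=None):
    """Load a serialised model if a valid path is given, else fall back
    to a freshly built network with large random weights."""
    if model_file is not None:
        model_file = Path(model_file)
        if not (model_file.exists() and model_file.is_file()):
            raise FileNotFoundError(model_file)
        net = torch.load(model_file, map_location=dev)
    else:
        net = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 1))
        with torch.no_grad():
            for p in net.parameters():
                p.uniform_(-1e4, 1e4)  # crude stand-in for enlarge_weights
    return net.to(dev)
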
Example #4
def main(reg=False, is_extra=True):
    pair = ['TARGET_CHEMBLID', 'CMPD_CHEMBLID', 'PCHEMBL_VALUE',
            'CANONICAL_SMILES', 'ACTIVITY_COMMENT', 'STANDARD_TYPE', 'RELATION']
    df = pd.read_csv('data/AR_ALL.csv')
    cmps = df.set_index(pair[1])[pair[3]].drop_duplicates()
    df = df[pair].set_index(pair[0:2])
    df['PCHEMBL_VALUE'] = df.groupby(pair[0:2])['PCHEMBL_VALUE'].mean()
    numery = df[pair[2:4]].dropna().drop_duplicates()

    comments = df[(df.ACTIVITY_COMMENT.str.contains('Not Active') == True)]
    inhibits = df[(df.STANDARD_TYPE == 'Inhibition') & df.RELATION.isin(['<', '<='])]
    relations = df[df.STANDARD_TYPE.isin(['EC50', 'IC50', 'Kd', 'Ki']) & df.RELATION.isin(['>', '>='])]
    binary = pd.concat([comments, inhibits, relations], axis=0)
    binary = binary[~binary.index.isin(numery.index)]
    binary['PCHEMBL_VALUE'] = 3.99
    binary = binary[pair[2:4]].dropna().drop_duplicates()

    df = pd.concat([numery, binary])  # DataFrame.append was removed in recent pandas
    df = df[pair[2]].unstack(pair[0])
    df = df.sample(len(df))

    if reg:
        test = binary[pair[2]].sample(len(binary)).unstack(pair[0])
    else:
        df = (df > 6.5).astype(float)
        test = df.sample(len(df)//8)
        df = df.drop(test.index)
    data = df if is_extra else numery.sample(len(numery))

    indep_set = MolData(cmps.loc[test.index], test.values)
    indep_loader = DataLoader(indep_set, batch_size=BATCH_SIZE)
    folds = KFold(5).split(data)
    cvs = np.zeros(data.shape)
    inds = np.zeros(test.shape)
    # NOTE: `subset` is not defined in this snippet; it is assumed to come from module scope
    out = 'output/gcn%s' % ('_' + subset if subset else '')
    for i, (train_idx, valid_idx) in enumerate(folds):
        trained, valided = data.iloc[train_idx], data.iloc[valid_idx]
        train_set = MolData(cmps.loc[trained.index], trained.values)
        valid_set = MolData(cmps.loc[valided.index], valided.values)
        train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
        valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE)
        net = QSAR(hid_dim=128, n_class=data.shape[1]).to(util.dev)
        net = net.fit(train_loader, valid_loader, epochs=N_EPOCH, path='%s_%d' % (out, i))
        print('Loss on validation set: %f' % net.evaluate(valid_loader))
        print('Loss on independent set: %f' % net.evaluate(indep_loader))
        cvs[valid_idx] = net.predict(valid_loader)
        inds += net.predict(indep_loader)

    data_score, test_score = pd.DataFrame(), pd.DataFrame()
    data_score['LABEL'] = data.stack()
    test_score['LABEL'] = test.stack()
    data_score['SCORE'] = pd.DataFrame(cvs, index=data.index, columns=data.columns).stack()
    test_score['SCORE'] = pd.DataFrame(inds, index=test.index, columns=test.columns).stack()
    data_score.to_csv(out + '.cv.txt')
    test_score.to_csv(out + '.ind.txt')
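
The fold loop in Example #4 builds out-of-fold predictions for the training data (cvs) and sums independent-set predictions across folds (inds). Here is a minimal sketch of that bookkeeping with scikit-learn, using synthetic data and a logistic regression where the example uses the QSAR network; dividing inds by the fold count (a small addition here) turns the sum into an ensemble average.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X_ind, y_ind = make_classification(n_samples=50, n_features=10, random_state=1)

cvs = np.zeros(len(y))       # out-of-fold predictions on the training data
inds = np.zeros(len(y_ind))  # accumulated predictions on the independent set

folds = KFold(n_splits=5, shuffle=True, random_state=0).split(X)
for train_idx, valid_idx in folds:
    model = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
    cvs[valid_idx] = model.predict_proba(X[valid_idx])[:, 1]
    inds += model.predict_proba(X_ind)[:, 1]

inds /= 5  # average over the five folds
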
Example #5
def build_data_net(args, target):
    if args.fp_method == FP_METHODS[0]:
        # CFP
        data = SmileData(SMILES, target, fp_len=FP_LEN, radius=4)
        net = lambda: MLP(hid_dim=FP_LEN, n_class=1)
        return data, net
    elif args.fp_method == FP_METHODS[1]:
        # NFP
        net = lambda: QSAR(hid_dim=128, n_class=1)
        data = MolData(SMILES, target)
        return data, net
    else:
        raise NotImplementedError
Example #6
def create_net(hid_dim, n_class, pre_trained=None):
    """
        pre_trained := the pretrained model file path
    """
    if pre_trained is None:
        # create a new QSAR network
        net = QSAR(hid_dim=hid_dim, n_class=n_class)
    else:
        if not Path(pre_trained).exists():
            raise FileNotFoundError
        prenet = torch.load(pre_trained, map_location=dev)
        net = PreFP(prenet.nfp, hid_dim=hid_dim, n_class=n_class)

    return net
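
Example #6 reuses the neural-fingerprint front end of a previously trained model (prenet.nfp) inside a new head (PreFP). Below is a generic sketch of that transfer pattern with plain torch.nn modules; the Encoder class, the layer sizes and the freezing step are assumptions made for illustration, not the project's own API.

import torch
from torch import nn


class Encoder(nn.Module):
    """Stand-in for the pretrained fingerprint module (prenet.nfp)."""
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(32, 128), nn.ReLU())

    def forward(self, x):
        return self.net(x)


def create_classifier(n_class, pretrained_encoder=None):
    # reuse the given encoder if there is one, otherwise start from scratch
    encoder = pretrained_encoder if pretrained_encoder is not None else Encoder()
    return nn.Sequential(encoder, nn.Linear(128, n_class))


# a previously trained model (pretend it has been trained already)
old = nn.Sequential(Encoder(), nn.Linear(128, 2))

# reuse its encoder as the front end of a new 3-class classifier
net = create_classifier(n_class=3, pretrained_encoder=old[0])

# optionally freeze the reused encoder so only the new head trains
for p in net[0].parameters():
    p.requires_grad = False
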
Example #7
def main():
    # set the random seed
    setup_seed(0)

    print('\nTRAIN START!!!')
    with Timer() as t2:
        train_set, valid_set = pickle_out(start=0, amount=10, random_state=0)
        train_amount = 5000
        pos_amount, neg_amount = 50, 50  # keep 50 positive and 50 negative validation samples
        pos_lst, neg_lst = [], []

        print('train:', Counter(train_set[5][:train_amount].view(-1).cpu().numpy().tolist()))
        print('valid:', Counter(valid_set[5].view(-1).cpu().numpy().tolist()))
        train_set, valid_set = AllData_pk(train_set), AllData_pk(valid_set)

        # NOTE: debugging early exit; everything below this return never runs as written
        print(train_set[0][1].shape)
        return

        for valid_item in valid_set:
            if pos_amount!=0 and valid_item[5].item()==1:
                pos_amount-=1
                pos_lst.append(valid_item)
                continue
            if neg_amount!=0 and valid_item[5].item()==0:
                neg_amount-=1
                neg_lst.append(valid_item)
                continue
            if pos_amount==0 and neg_amount==0:
                break
        print('valid: Counter 0.0: {}, 1.0: {}'.format(len(neg_lst), len(pos_lst)))
        valid_set = pos_lst + neg_lst
        print('pickle:')
    with Timer() as t3:
        # print('train data: {}, valid data: {}\n'.format(len(train_set), len(valid_set)))
        print('train data: {}, valid data: {}\n'.format(train_amount, len(valid_set)))
        # for i in range(train_amount):
        #     break
        #     if i < 7: continue

        #     print('### data i')

        #     print('label:', train_set[i][-1])

        #     sum = 0
        #     for j in range(6):
        #         sum += (train_set[i][j] != train_set[i][j]).sum().item()
        #     print('NaN count:', sum)

        #     print('tensor max values:', train_set[i][0].max(), train_set[i][1].max(), train_set[i][2].max(), train_set[i][3].max(), train_set[i][4].max(), train_set[i][5].max())
        #     print('tensor min values:', train_set[i][0].min(), train_set[i][1].min(), train_set[i][2].min(), train_set[i][3].min(), train_set[i][4].min(), train_set[i][5].min())

        #     # print('tensors:', train_set[i][0], train_set[i][1], train_set[i][2], train_set[i][3], train_set[i][4], train_set[i][5])

        t_set = [train_set[i] for i in range(train_amount)]
        train_loader = DataLoader(t_set, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
        valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE)
        print('data load:')
    with Timer() as t4:
        # net = GraphConvAutoEncoder(hid_dim_m=128, hid_dim_p=128, n_class=2)
        # net = net.fit(train_loader, epochs=100)
        net = QSAR(hid_dim_m=216, hid_dim_p=512, n_class=1)
        net = net.fit(train_loader, valid_loader, epochs=N_EPOCH, path='output/gcnn')
        print('model:')
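
Example #7 calls setup_seed(0) before anything else. That helper is project-specific; a common implementation (an assumption here, not taken from this codebase) seeds Python, NumPy and PyTorch together and pins cuDNN to deterministic kernels:

import random

import numpy as np
import torch


def setup_seed(seed):
    """Seed every RNG the training loop touches so runs are repeatable."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
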
Example #8
                        "--method",
                        help="choose the fingerprint method to compute\
                        similarity score",
                        default="nfp",
                        choices=METHODS)
    parser.add_argument("--model",
                        help="choose the saved model file for nfp\
                        method. If not specified, large random weights would\
                        be used",
                        type=str)

    args = parser.parse_args()
    if args.method == METHODS[0]:
        fp1 = calc_circular_fp(args.s1, radius=6, fp_len=FP_LEN)
        fp2 = calc_circular_fp(args.s2, radius=6, fp_len=FP_LEN)
        print(1 - tanimoto_distance(fp1, fp2))
    elif args.method == METHODS[1]:
        if args.model is not None:
            model_file = Path(args.model)
            if model_file.exists() and model_file.is_file():
                net = torch.load(args.model)
            else:
                raise FileNotFoundError
        else:
            net = QSAR(hid_dim=FP_LEN, n_class=1, max_degree=6)
            enlarge_weights(net, -1e4, 1e4)

        tmp = tensorise_smiles([args.s1, args.s2])
        fp1, fp2 = calc_neural_fp(tmp, net)
        print(1 - tanimoto_distance(fp1, fp2))
    # NOTE: debugging leftover; exit() here makes the sweep below unreachable as written
    tmp = df['smiles'][0]
    print(get_circular_fp(tmp))
    exit()

    # Plot with random weights and a 2048-bit fingerprint, as in Figure 3 (left)
    gcn_act = ['sigmoid', 'relu', 'tanh']
    gop_act = ['sigmoid', 'tanh', 'softmax']
    large_weights = [(-1e7, 1e7), (0, 1e7), (-1e3, 1e3), (-10, 10)]
    max_degs = [1, 6]
    res = {}
    for a1, a2, bnds, rd in its.product(gcn_act, gop_act, large_weights,
                                        max_degs):
        SEED, FP_LEN = 7, 1 << 11
        net = QSAR(hid_dim=FP_LEN,
                   n_class=1,
                   max_degree=rd,
                   gcn_activation=a1,
                   gop_activation=a2)
        print("nbnds", bnds)
        change_net_to_weights(net.nfp, *bnds)
        tmp = calc_distance(net,
                            data,
                            df['smiles'],
                            FP_LEN,
                            sample_sz=500,
                            SEED=7)
        tmp = calc_corr(tmp)
        res[f"gcn-{a1}_gop-{a2}_weights-{bnds}_radius-{rd}"] = tmp
        print(f"gcn-{a1}_gop-{a2}_weights-{bnds}_radius-{rd}", tmp)

    with open('./output.json', 'w') as fp:
        json.dump(res, fp)  # assumes `json` is imported at module level

def compute_neural_fingerprints(smiles=None, smiles_file=None, start_index=0,
                                batch_size=0, out_file=None, bad_file=None,
                                save_csv=False, overwrite=False, save_gzip=False,
                                model_file=None, max_degree=6):
    import os
    import logging
    import pickle
    from rdkit import Chem
    from rdkit.Chem import AllChem
    import csv
    import numpy as np
    import torch
    from NeuralGraph.model import QSAR
    from NeuralGraph.util import dev, enlarge_weights

    if not save_csv:
        raise Exception("Neural FPs only support CSV output")

    if not overwrite and os.path.exists(out_file):
        raise Exception("File exists: %s" % out_file)

    # Load the model
    if model_file is not None:
        model_file = Path(model_file)
        if model_file.exists() and model_file.is_file():
            net = torch.load(model_file, map_location=dev)
        else:
            raise FileNotFoundError
    else: # random large weights
        net = QSAR(hid_dim=128, n_class=1, max_degree=6)
        enlarge_weights(net, -1e4, 1e4)

    net = net.to(dev)

    # read smiles
    if smiles_file:
        with open(smiles_file) as current:
            current.seek(start_index)  # start_index is a byte offset into the file, not a line number
            smiles = [current.readline() for _ in range(batch_size)]
   
    sep = ","
    bad = []
    results = []
    
    for s in smiles:
        try: 
            mol_tuple = s.split(',')
            dataset = mol_tuple[0].rstrip()
            identifier = mol_tuple[1].rstrip()
            sml = mol_tuple[2].rstrip()
       
            good = True
            mol = Chem.MolFromSmiles(sml)
            if mol:
                atoms = mol.GetAtoms()
                for atom in atoms:
                    if atom.GetDegree() >= max_degree:
                        bad.append(mol_tuple)
                        good = False
            else:
                bad.append(mol_tuple)
                good = False
            if good:
                fp = np.concatenate(net.calc_nfp([sml]))
                #fp = net.calc_nfp([sml])
                fp_ = ':'.join("{:.7f}".format(x) for x in fp)
                results.append((dataset, identifier, sml, fp_))
        except Exception:  # skip lines that fail to parse or featurise
            bad.append(s)

    with open(out_file, 'w') as output_file:
        writer = csv.writer(output_file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerows(results)
    if bad_file and len(bad) > 0:
        with open(bad_file, 'w') as b_file:
            b_writer = csv.writer(b_file, delimiter=',')  # quoting=csv.QUOTE_MINIMAL
            b_writer.writerows(bad)

    return out_file
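
Example #8 compares a circular-fingerprint method (calc_circular_fp plus tanimoto_distance) against neural fingerprints, and compute_neural_fingerprints filters out molecules RDKit cannot parse. For reference, here is a small RDKit-only sketch of the circular side: Morgan fingerprints for two SMILES and their Tanimoto similarity. The radius, bit length and example SMILES are arbitrary choices, not values taken from the project.

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

FP_LEN = 2048  # illustrative fingerprint length


def circular_fp(smiles, radius=2, fp_len=FP_LEN):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError("could not parse SMILES: %s" % smiles)
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=fp_len)


fp1 = circular_fp("CCO")  # ethanol
fp2 = circular_fp("CCN")  # ethylamine
print(DataStructs.TanimotoSimilarity(fp1, fp2))
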