def main():
    print('\nTRAIN START!!!')
    with Timer() as t2:
        train_set, valid_set = pickle_out(start=0, amount=1, random_state=None)
        # print(train_set[5].shape)
        # print(valid_set[5].shape)
        print('train:', Counter(train_set[5].view(-1).cpu().numpy().tolist()))
        print('valid:', Counter(valid_set[5].view(-1).cpu().numpy().tolist()))
        train_set, valid_set = AllData_pk(train_set), AllData_pk(valid_set)
        print('pickle:')
    with Timer() as t3:
        print(len(train_set), len(valid_set))
        train_loader = DataLoader(train_set, batch_size=BATCH_SIZE,
                                  shuffle=True, drop_last=True)
        valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE)
        print('data load:')
    with Timer() as t4:
        # net = GraphConvAutoEncoder(hid_dim_m=128, hid_dim_p=128, n_class=2)
        # net = net.fit(train_loader, epochs=100)
        net = QSAR(hid_dim_m=216, hid_dim_p=512, n_class=2)
        net = net.fit(train_loader, valid_loader, epochs=N_EPOCH,
                      path='output/gcnn')
        print('model:')
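# NOTE: the `Timer` context manager used above is defined elsewhere in this
# repo and is not shown in this snippet. A minimal sketch consistent with the
# usage pattern `with Timer() as t: ...; print('label:')` might look like the
# class below; it is an illustration under that assumption, not the project's
# actual implementation (hence the `_TimerSketch` name).
import time


class _TimerSketch:
    """Context manager that reports the wall-clock time spent inside its block."""

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.elapsed = time.perf_counter() - self.start
        print('elapsed: %.3f s' % self.elapsed)
        return False  # do not suppress exceptions raised inside the block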
def main():
    with Timer() as t1:
        data_path = '/home/ubuntu/wangzhongxu/gcnn2/NGFP/dataset'
        pd_filename = 'pd_test.txt'
        pd_lst = data_parser(data_path, pd_filename)
        tmp_lst = [0 for _ in pd_lst]
        print('data parse:')
    with Timer() as t2:
        train_set, valid_set, _, _ = train_test_split(pd_lst, tmp_lst,
                                                      test_size=0.2,
                                                      random_state=0)
        train_set, valid_set = AllData(train_set, data_path), AllData(valid_set, data_path)
        print('tensorize:')
    with Timer() as t3:
        print(len(train_set), len(valid_set))
        train_loader = DataLoader(train_set, batch_size=BATCH_SIZE,
                                  shuffle=True, drop_last=True)
        valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE)
        print('data load:')
    with Timer() as t4:
        # net = GraphConvAutoEncoder(hid_dim_m=128, hid_dim_p=128, n_class=2)
        # net = net.fit(train_loader, epochs=100)
        net = QSAR(hid_dim_m=216, hid_dim_p=512, n_class=2)
        net = net.fit(train_loader, valid_loader, epochs=N_EPOCH,
                      path='output/gcnn')
        print('model:')
def try_load_net(model_file=None):
    if model_file is not None:
        model_file = Path(model_file)
        if model_file.exists() and model_file.is_file():
            net = torch.load(model_file, map_location=dev)
        else:
            raise FileNotFoundError(model_file)
    else:
        # random large weights
        net = QSAR(hid_dim=128, n_class=1, max_degree=6)
        enlarge_weights(net, -1e4, 1e4)
    return net.to(dev)
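# Usage sketch for try_load_net (illustrative). The checkpoint path below is
# hypothetical, and `calc_nfp` is assumed to be the QSAR method used elsewhere
# in this repo to compute neural fingerprints from a list of SMILES.
if __name__ == '__main__':
    model_path = 'output/gcn_example.pkg'  # hypothetical checkpoint path
    net = try_load_net(model_path if Path(model_path).exists() else None)
    fp = net.calc_nfp(['CCO'])             # neural fingerprint for ethanol
    print(type(net).__name__, len(fp))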
def main(reg=False, is_extra=True):
    pair = ['TARGET_CHEMBLID', 'CMPD_CHEMBLID', 'PCHEMBL_VALUE',
            'CANONICAL_SMILES', 'ACTIVITY_COMMENT', 'STANDARD_TYPE', 'RELATION']
    df = pd.read_csv('data/AR_ALL.csv')
    cmps = df.set_index(pair[1])[pair[3]].drop_duplicates()
    df = df[pair].set_index(pair[0:2])
    df['PCHEMBL_VALUE'] = df.groupby(pair[0:2]).mean()

    # rows with measured pChEMBL values
    numery = df[pair[2:4]].dropna().drop_duplicates()
    # rows that only indicate inactivity; they receive a low pseudo pChEMBL value below
    comments = df[(df.ACTIVITY_COMMENT.str.contains('Not Active') == True)]
    inhibits = df[(df.STANDARD_TYPE == 'Inhibition') & df.RELATION.isin(['<', '<='])]
    relations = df[df.STANDARD_TYPE.isin(['EC50', 'IC50', 'Kd', 'Ki'])
                   & df.RELATION.isin(['>', '>='])]
    binary = pd.concat([comments, inhibits, relations], axis=0)
    binary = binary[~binary.index.isin(numery.index)]
    binary['PCHEMBL_VALUE'] = 3.99
    binary = binary[pair[2:4]].dropna().drop_duplicates()

    df = pd.concat([numery, binary])  # DataFrame.append is deprecated in pandas
    df = df[pair[2]].unstack(pair[0])
    df = df.sample(len(df))
    if reg:
        test = binary[pair[2]].sample(len(binary)).unstack(pair[0])
    else:
        df = (df > 6.5).astype(float)
        test = df.sample(len(df) // 8)
        df = df.drop(test.index)
    data = df if is_extra else numery.sample(len(numery))

    indep_set = MolData(cmps.loc[test.index], test.values)
    indep_loader = DataLoader(indep_set, batch_size=BATCH_SIZE)

    folds = KFold(5).split(data)
    cvs = np.zeros(data.shape)
    inds = np.zeros(test.shape)
    # NOTE: `subset` is not defined in this function; it is assumed to be a
    # module-level variable (e.g. parsed from the command line).
    out = 'output/gcn%s' % ('_' + subset if subset else '')
    for i, (train_idx, valid_idx) in enumerate(folds):
        trained, valided = data.iloc[train_idx], data.iloc[valid_idx]
        train_set = MolData(cmps.loc[trained.index], trained.values)
        valid_set = MolData(cmps.loc[valided.index], valided.values)
        train_loader = DataLoader(train_set, batch_size=BATCH_SIZE,
                                  shuffle=True, drop_last=True)
        valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE)
        net = QSAR(hid_dim=128, n_class=data.shape[1]).to(util.dev)
        net = net.fit(train_loader, valid_loader, epochs=N_EPOCH,
                      path='%s_%d' % (out, i))
        print('Validation set loss: %f' % net.evaluate(valid_loader))
        print('Independent set loss: %f' % net.evaluate(indep_loader))
        cvs[valid_idx] = net.predict(valid_loader)  # index by row positions, not the sliced frame
        inds += net.predict(indep_loader)           # summed over the five folds

    data_score, test_score = pd.DataFrame(), pd.DataFrame()
    data_score['LABEL'] = data.stack()
    test_score['LABEL'] = test.stack()
    data_score['SCORE'] = pd.DataFrame(cvs, index=data.index, columns=data.columns).stack()
    test_score['SCORE'] = pd.DataFrame(inds, index=test.index, columns=test.columns).stack()
    data_score.to_csv(out + '.cv.txt')
    test_score.to_csv(out + '.ind.txt')
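# Sketch: scoring the cross-validation / independent-set files written above
# (classification case only). The LABEL and SCORE column names match the
# DataFrames built above; 'output/gcn' is the prefix used when `subset` is
# empty, so the default here is an assumption. AUC only needs a ranking, so
# the fold-summed independent-set scores are still usable.
def report_auc(prefix='output/gcn'):
    import pandas as pd
    from sklearn.metrics import roc_auc_score
    for suffix in ('.cv.txt', '.ind.txt'):
        scores = pd.read_csv(prefix + suffix)
        print(suffix, 'AUC = %.3f' % roc_auc_score(scores['LABEL'], scores['SCORE']))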
def build_data_net(args, target):
    if args.fp_method == FP_METHODS[0]:
        # CFP (circular fingerprint)
        data = SmileData(SMILES, target, fp_len=FP_LEN, radius=4)
        net = lambda: MLP(hid_dim=FP_LEN, n_class=1)
        return data, net
    elif args.fp_method == FP_METHODS[1]:
        # NFP (neural fingerprint)
        net = lambda: QSAR(hid_dim=128, n_class=1)
        data = MolData(SMILES, target)
        return data, net
    else:
        raise NotImplementedError
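# Usage sketch for build_data_net (illustrative). It assumes the module-level
# SMILES / FP_LEN / FP_METHODS globals referenced above, plus DataLoader and
# BATCH_SIZE being available in this module; the Namespace stands in for the
# parsed command-line arguments.
if __name__ == '__main__':
    from argparse import Namespace
    import numpy as np

    args = Namespace(fp_method=FP_METHODS[1])  # use the neural-fingerprint branch
    target = np.zeros(len(SMILES))             # placeholder labels, illustration only
    data, make_net = build_data_net(args, target)
    net = make_net()                           # the second return value is a factory
    loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True)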
def create_net(hid_dim, n_class, pre_trained=None):
    """pre_trained := the pretrained model file path."""
    if pre_trained is None:
        # create a new QSAR network from scratch
        net = QSAR(hid_dim=hid_dim, n_class=n_class)
    else:
        if not Path(pre_trained).exists():
            raise FileNotFoundError(pre_trained)
        prenet = torch.load(pre_trained, map_location=dev)
        # reuse the pretrained neural-fingerprint layers
        net = PreFP(prenet.nfp, hid_dim=hid_dim, n_class=n_class)
    return net
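# Usage sketch for create_net (illustrative; the checkpoint path is hypothetical).
if __name__ == '__main__':
    net = create_net(hid_dim=128, n_class=1)  # train from scratch
    pre = 'output/pretrained_nfp.pkg'         # hypothetical pretrained checkpoint
    if Path(pre).exists():
        # reuse the pretrained neural-fingerprint layers via PreFP
        net = create_net(hid_dim=128, n_class=1, pre_trained=pre)
    net = net.to(dev)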
def main():
    # set the random seed for reproducibility
    setup_seed(0)
    print('\nTRAIN START!!!')
    with Timer() as t2:
        train_set, valid_set = pickle_out(start=0, amount=10, random_state=0)
        train_amount = 5000
        pos_amount, neg_amount = 50, 50  # 50 positive and 50 negative validation samples
        pos_lst, neg_lst = [], []
        print('train:', Counter(train_set[5][:train_amount].view(-1).cpu().numpy().tolist()))
        print('valid:', Counter(valid_set[5].view(-1).cpu().numpy().tolist()))
        train_set, valid_set = AllData_pk(train_set), AllData_pk(valid_set)
        print(train_set[0][1].shape)
        return  # NOTE: early return for debugging; the code below is unreachable
        for valid_item in valid_set:
            if pos_amount != 0 and valid_item[5].item() == 1:
                pos_amount -= 1
                pos_lst.append(valid_item)
                continue
            if neg_amount != 0 and valid_item[5].item() == 0:
                neg_amount -= 1
                neg_lst.append(valid_item)
                continue
            if pos_amount == 0 and neg_amount == 0:
                break
        print('valid: Counter 0.0: {}, 1.0: {}'.format(len(neg_lst), len(pos_lst)))
        valid_set = pos_lst + neg_lst
        print('pickle:')
    with Timer() as t3:
        # print('train data: {}, valid data: {}\n'.format(len(train_set), len(valid_set)))
        print('train data: {}, valid data: {}\n'.format(train_amount, len(valid_set)))
        # Debugging block: inspect training samples for NaNs and value ranges.
        # for i in range(train_amount):
        #     break
        #     if i < 7: continue
        #     print('### data i')
        #     print('label:', train_set[i][-1])
        #     sum = 0
        #     for j in range(6):
        #         sum += (train_set[i][j] != train_set[i][j]).sum().item()
        #     print('number of NaN values:', sum)
        #     print('tensor max values:', train_set[i][0].max(), train_set[i][1].max(), train_set[i][2].max(), train_set[i][3].max(), train_set[i][4].max(), train_set[i][5].max())
        #     print('tensor min values:', train_set[i][0].min(), train_set[i][1].min(), train_set[i][2].min(), train_set[i][3].min(), train_set[i][4].min(), train_set[i][5].min())
        #     # print('tensors:', train_set[i][0], train_set[i][1], train_set[i][2], train_set[i][3], train_set[i][4], train_set[i][5])
        t_set = [train_set[i] for i in range(train_amount)]
        train_loader = DataLoader(t_set, batch_size=BATCH_SIZE,
                                  shuffle=True, drop_last=True)
        valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE)
        print('data load:')
    with Timer() as t4:
        # net = GraphConvAutoEncoder(hid_dim_m=128, hid_dim_p=128, n_class=2)
        # net = net.fit(train_loader, epochs=100)
        net = QSAR(hid_dim_m=216, hid_dim_p=512, n_class=1)
        net = net.fit(train_loader, valid_loader, epochs=N_EPOCH,
                      path='output/gcnn')
        print('model:')
"--method", help="choose the fingerprint method to compute\ similarity score", default="nfp", choices=METHODS) parser.add_argument("--model", help="choose the saved model file for nfp\ method. If not specified, large random weights would\ be used", type=str) args = parser.parse_args() if args.method == METHODS[0]: fp1 = calc_circular_fp(args.s1, radius=6, fp_len=FP_LEN) fp2 = calc_circular_fp(args.s2, radius=6, fp_len=FP_LEN) print(1 - tanimoto_distance(fp1, fp2)) elif args.method == METHODS[1]: if args.model is not None: model_file = Path(args.model) if model_file.exists() and model_file.is_file(): net = torch.load(args.model) else: raise FileNotFoundError else: net = QSAR(hid_dim=FP_LEN, n_class=1, max_degree=6) enlarge_weights(net, -1e4, 1e4) tmp = tensorise_smiles([args.s1, args.s2]) fp1, fp2 = calc_neural_fp(tmp, net) print(1 - tanimoto_distance(fp1, fp2))
tmp = df['smiles'][0]
print(get_circular_fp(tmp))
exit()

# Plot with random weights and a 2048-bit length as in Figure 3 (left)
gcn_act = ['sigmoid', 'relu', 'tanh']
gop_act = ['sigmoid', 'tanh', 'softmax']
large_weights = [(-1e7, 1e7), (0, 1e7), (-1e3, 1e3), (-10, 10)]
max_degs = [1, 6]
res = {}
for a1, a2, bnds, rd in its.product(gcn_act, gop_act, large_weights, max_degs):
    SEED, FP_LEN = 7, 1 << 11
    net = QSAR(hid_dim=FP_LEN, n_class=1, max_degree=rd,
               gcn_activation=a1, gop_activation=a2)
    print("nbnds", bnds)
    change_net_to_weights(net.nfp, *bnds)
    tmp = calc_distance(net, data, df['smiles'], FP_LEN, sample_sz=500, SEED=7)
    tmp = calc_corr(tmp)
    res[f"gcn-{a1}_gop-{a2}_weights-{bnds}_radius-{rd}"] = tmp
    print(f"gcn-{a1}_gop-{a2}_weights-{bnds}_radius-{rd}", tmp)

with open('./output.json', 'w') as fp:
def compute_neural_fingerprints(smiles=None, smiles_file=None, start_index=0,
                                batch_size=0, out_file=None, bad_file=None,
                                save_csv=False, overwrite=False, save_gzip=False,
                                model_file=None, max_degree=6):
    import os
    import logging
    import pickle
    import csv
    from pathlib import Path
    import numpy as np
    import torch
    from rdkit import Chem
    from rdkit.Chem import AllChem
    from NeuralGraph.model import QSAR
    from NeuralGraph.util import dev, enlarge_weights

    if not save_csv:
        raise Exception("Neural FPs only support CSV output")
    if not overwrite and os.path.exists(out_file):
        raise Exception("File exists: %s" % out_file)

    # load the model
    if model_file is not None:
        model_file = Path(model_file)
        if model_file.exists() and model_file.is_file():
            net = torch.load(model_file, map_location=dev)
        else:
            raise FileNotFoundError(model_file)
    else:
        # random large weights
        net = QSAR(hid_dim=128, n_class=1, max_degree=max_degree)
        enlarge_weights(net, -1e4, 1e4)
    net = net.to(dev)

    # read smiles
    if smiles_file:
        with open(smiles_file) as current:
            current.seek(start_index)
            smiles = [current.readline() for i in range(batch_size)]

    sep = ","
    bad = []
    results = []
    for s in smiles:
        try:
            mol_tuple = s.split(',')
            dataset = mol_tuple[0].rstrip()
            identifier = mol_tuple[1].rstrip()
            sml = mol_tuple[2].rstrip()
            good = True
            mol = Chem.MolFromSmiles(sml)
            if mol:
                atoms = mol.GetAtoms()
                for atom in atoms:
                    if atom.GetDegree() >= max_degree:
                        bad.append(mol_tuple)
                        good = False
                        break  # one rejection per molecule is enough
            else:
                bad.append(mol_tuple)
                good = False
            if good:
                fp = np.concatenate(net.calc_nfp([sml]))
                # fp = net.calc_nfp([sml])
                fp_ = ':'.join("{:.7f}".format(x) for x in fp)
                results.append((dataset, identifier, sml, fp_))
        except Exception:
            bad.append(s)

    with open(out_file, 'w') as output_file:
        writer = csv.writer(output_file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerows(results)
    if bad_file and len(bad) > 0:
        with open(bad_file, 'w') as b_file:
            b_writer = csv.writer(b_file, delimiter=',')
            # quoting=csv.QUOTE_MINIMAL)
            b_writer.writerows(bad)
    return out_file
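# Usage sketch for compute_neural_fingerprints (illustrative; the file paths
# are hypothetical and the input CSV is assumed to contain
# dataset,identifier,smiles rows).
if __name__ == '__main__':
    out = compute_neural_fingerprints(
        smiles_file='data/compounds.csv',       # hypothetical input file
        start_index=0,                          # byte offset to seek to before reading
        batch_size=1000,                        # number of lines to read
        out_file='output/nfp_batch0.csv',       # hypothetical output file
        bad_file='output/nfp_batch0.bad.csv',   # rejected molecules land here
        save_csv=True,                          # CSV is the only supported output
        overwrite=True,
        model_file=None,                        # None -> random network with large weights
    )
    print('wrote', out)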