def data_transforming(traindf):
    """Embed the molecules in ``traindf`` with mol2vec and return the vectors.

    The frame is modified in place: the columns ``'mol'``, ``'sentence'`` and
    ``'mol2vec'`` are added. A short demonstration for the row labelled ``1``
    is printed along the way. The word2vec ``model`` is read from the
    enclosing (module) scope.

    Args:
        traindf: DataFrame holding SMILES strings in ``'SMILES sequence'``.

    Returns:
        A numpy array built from the ``vec`` attribute of every ``DfVec``.
    """
    # Transforming SMILES to MOL
    traindf['mol'] = traindf['SMILES sequence'].apply(
        lambda smi: Chem.MolFromSmiles(smi))

    # Demonstration output for the molecule stored under index label 1.
    sample_words = mol2alt_sentence(traindf['mol'][1], radius=1)
    print('Molecular sentence:', sample_words)
    print('\nMolSentence object:', MolSentence(sample_words))
    print('\nDfVec object:',
          DfVec(sentences2vec(MolSentence(sample_words), model,
                              unseen='UNK')))

    # Constructing sentences
    traindf['sentence'] = traindf.apply(
        lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)

    # Extracting embeddings. unseen='UNK' is always passed to sentences2vec()
    # so that substructures unknown to the model are mapped to the 'UNK' token.
    embedded = sentences2vec(traindf['sentence'], model, unseen='UNK')
    traindf['mol2vec'] = [DfVec(vector) for vector in embedded]

    return np.array([entry.vec for entry in traindf['mol2vec']])
def mol2vec(fin_name, fout_name, clean=False):
    """Convert the SMILES in a CSV file to mol2vec vectors and save them.

    Args:
        fin_name: Path of the input CSV; it must contain a ``'smiles'`` column.
        fout_name: Path the resulting table is written to with ``to_csv``.
        clean: When true, ``clean_file(fin_name, fin_name)`` is run first so
            that the input file is cleaned in place (only needed once).

    Returns:
        The numpy array of embedding vectors, one row per molecule.
    """
    # Optional cleaning pass: removes SMILES strings that cannot be converted
    # to molecules. May be improved later.
    if clean:
        print('cleaning data...')
        clean_file(fin_name, fin_name)

    clean_data = pd.read_csv(fin_name)
    smiles = clean_data['smiles'].values

    # Load pre-trained model
    model = word2vec.Word2Vec.load('./models/model_300dim.pkl')
    print('making vec data...')

    # SMILES -> molecule -> sentence
    sentences = [
        MolSentence(mol2alt_sentence(Chem.MolFromSmiles(smi), 1))
        for smi in smiles
    ]

    # sentence -> vector
    embedded = sentences2vec(sentences, model, unseen='UNK')
    vec_values = np.array([DfVec(vector).vec for vector in embedded])

    # Assemble the output table: 300 vector columns with the SMILES in front.
    column_names = ['vec_' + str(i) for i in range(300)]
    df = pd.DataFrame(vec_values, columns=column_names)
    df.insert(0, "smiles", smiles, True)
    df.to_csv(fout_name)

    return vec_values
def polymer_embeddings(cls, smile):
    """Return the embedding of a single SMILES string as a 2-D numpy array.

    The word2vec model is loaded from ``'regressor/POLYINFO_PI1M.pkl'`` on
    every call. ``cls`` is not used by the body.

    Args:
        cls: Unused.
        smile: SMILES string of the molecule to embed.

    Returns:
        numpy array holding one row: the vector of ``smile``.
    """
    model = word2vec.Word2Vec.load('regressor/POLYINFO_PI1M.pkl')

    # A one-element batch, because sentences2vec works on a list of sentences.
    molecule = Chem.MolFromSmiles(smile)
    sentences = [MolSentence(mol2alt_sentence(molecule, 1))]

    rows = [
        DfVec(vector).vec.tolist()
        for vector in sentences2vec(sentences, model, unseen='UNK')
    ]
    return np.array(rows)
def download_data(dev_mode: str,
                  model: word2vec.Word2Vec) -> (np.ndarray, np.ndarray):
    """Load a dataset and return its mol2vec features and integer labels.

    Args:
        dev_mode: ``'true'`` or ``'false'`` (case-insensitive). ``'false'``
            reads ``HIV.csv``; ``'true'`` reads the example ``ames.sdf``.
            Both files are looked up under ``args.data_dir``.
        model: Word2Vec model handed to ``sentences2vec``.

    Returns:
        Tuple ``(X, y)`` of numpy arrays.
    """
    mode = dev_mode.lower()
    assert mode in ('false', 'true')

    if mode == 'false':
        print('Using Actual Data...')
        df = pd.read_csv(os.path.join(args.data_dir, 'HIV.csv'))
        # Molecules have to be parsed from the 'smiles' column first.
        df['sentence'] = df.apply(
            lambda row: MolSentence(
                mol2alt_sentence(Chem.MolFromSmiles(row['smiles']), 1)),
            axis=1)
        label_column = 'HIV_active'
    else:
        # use example data set; LoadSDF already provides molecules in 'ROMol'
        df = PandasTools.LoadSDF(os.path.join(args.data_dir, 'ames.sdf'))
        df['sentence'] = df.apply(
            lambda row: MolSentence(mol2alt_sentence(row['ROMol'], 1)),
            axis=1)
        label_column = 'class'

    df['mol2vec'] = [
        DfVec(vector)
        for vector in sentences2vec(df['sentence'], model, unseen='UNK')
    ]

    # convert dataframe into numpy arrays for training
    X = np.array([entry.vec for entry in df['mol2vec']])
    y = np.array(df[label_column].astype(int))
    return X, y
def mol2vec(data):
    """Append mol2vec embedding columns to the descriptor columns of ``data``.

    ``data`` is modified in place: the columns ``'sentence'`` and
    ``'mol2vec'`` are added to it. The word2vec model is loaded from
    ``'model_300dim.pkl'`` on every call.

    Args:
        data: DataFrame with at least the columns ``'smiles'``,
            ``'activity'`` and ``'mol'``. Every other column is carried over
            to the result as a feature.

    Returns:
        DataFrame made of the remaining columns of ``data`` followed by the
        embedding columns (integer column labels), indexed like ``data``.
    """
    # Taken before the helper columns are added, so they stay out of `x`.
    x = data.drop(columns=['smiles', 'activity', 'mol'])
    model = word2vec.Word2Vec.load('model_300dim.pkl')

    data['sentence'] = data.apply(
        lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)

    # Extracting embeddings. unseen='UNK' is always passed to sentences2vec()
    # so that substructures unknown to the model are mapped to the 'UNK' token.
    data['mol2vec'] = [
        DfVec(vector)
        for vector in sentences2vec(data['sentence'], model, unseen='UNK')
    ]
    x_mol = np.array([entry.vec for entry in data['mol2vec']])

    # concat(axis=1) aligns rows on index labels. Reusing data's index keeps
    # the embeddings on their own rows even when `data` was filtered or
    # shuffled; a fresh RangeIndex would misalign them or introduce NaN rows.
    x_mol = pd.DataFrame(x_mol, index=data.index)

    # Concatenating matrices of features
    new_data = pd.concat((x, x_mol), axis=1)
    return new_data
def jak2(smile):
    """Score a SMILES string with the module-level classifier ``clf``.

    The molecule is embedded with mol2vec (module-level ``model``) and the
    embedding is passed to ``clf.predict`` through an ``xgb.DMatrix``.

    Args:
        smile: SMILES string of the molecule.

    Returns:
        ``0`` when the SMILES cannot be parsed. Otherwise 0.9 times the
        prediction plus uniform random noise drawn from [-0.05, 0.05).
    """
    mol = Chem.MolFromSmiles(smile)
    if not mol:
        return 0
    # Disabled substructure filter:
    # if mol.HasSubstructMatch(sb):
    #     return 0

    sentence = MolSentence(mol2alt_sentence(mol, 1))
    fp = [
        DfVec(vector).vec.tolist()
        for vector in sentences2vec(np.array([sentence]), model, unseen='UNK')
    ]
    # Disabled alternative featurization:
    # fp = Chem.GetMorganFingerprintAsBitVect(m, 3, nBits=1024)
    score = clf.predict(xgb.DMatrix(fp))[0]

    # QED is best-effort: any failure falls back to 0. Catch Exception rather
    # than using a bare except so KeyboardInterrupt/SystemExit still propagate.
    try:
        qed = QED.qed(mol)
    except Exception:
        qed = 0

    # QED currently carries weight 0, so only the classifier score counts.
    score = 1 * score + 0 * qed
    # Blend in a little random noise.
    score = score * 0.9 + (np.random.random_sample() - 0.5) * 0.1
    return score
def label(path, label_file, model, title):
    """Predict activities for the molecules in ``label_file``.

    Loads the ``"test"`` split returned by ``load_raw_data``, builds the
    feature matrix (remaining descriptor columns plus mol2vec embedding),
    standardizes it and stores ``model.predict_proba(...)[:, 1]`` in the
    ``'activity'`` column of a copy of the data without ``"smiles"``.

    Args:
        path: Passed through to ``load_raw_data``.
        label_file: File name, passed to ``load_raw_data`` inside a list.
        model: Fitted classifier exposing ``predict_proba``.
        title: Not used in the visible body.

    Returns:
        None. NOTE(review): ``write_data`` is built but neither returned nor
        written in the visible code -- confirm whether the function was cut.
    """
    data = load_raw_data(path, [label_file])["test"]
    # Taken before the helper columns are added, so they stay out of `x`.
    x = data.drop(columns=["smiles", "activity", 'mol'])
    process_model = word2vec.Word2Vec.load('model_300dim.pkl')

    data['sentence'] = data.apply(
        lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)

    # Extracting embeddings. unseen='UNK' is always passed to sentences2vec()
    # so that substructures unknown to the model are mapped to the 'UNK' token.
    data['mol2vec'] = [
        DfVec(vector)
        for vector in sentences2vec(data['sentence'], process_model,
                                    unseen='UNK')
    ]
    x_mol = np.array([entry.vec for entry in data['mol2vec']])

    # concat(axis=1) aligns rows on index labels. Reusing data's index keeps
    # the embeddings on their own rows even when the loaded frame does not
    # carry a default RangeIndex.
    x_mol = pd.DataFrame(x_mol, index=data.index)

    # Concatenating matrices of features
    x_test = pd.concat((x, x_mol), axis=1)
    # NOTE(review): the scaler is fitted on the data being predicted, not on
    # the training data -- confirm this is intended.
    x_test = StandardScaler().fit_transform(x_test)
    preds = model.predict_proba(x_test)[:, 1]

    write_data = data.drop(columns=["smiles"])
    write_data['activity'] = preds
def get_fp(smiles):
    """Embed every SMILES accepted by ``mol2image`` with mol2vec.

    A SMILES is invalid when the first element of
    ``mol2image(smiles_i, n=2048)`` is NaN; such entries are skipped.

    Args:
        smiles: Sequence of SMILES strings.

    Returns:
        Tuple ``(X, processed_indices, invalid_indices)``: ``X`` is the numpy
        array of embedding vectors for the valid entries, and the two lists
        hold the positions in ``smiles`` that were embedded and skipped.
    """
    model = word2vec.Word2Vec.load(
        '/content/drive/My Drive/model_300dim.pkl')

    processed_indices = []
    invalid_indices = []
    valid_smiles = []
    for i, smi in enumerate(smiles):
        image = np.array(mol2image(smi, n=2048))
        if np.isnan(image[0]):
            invalid_indices.append(i)
        else:
            valid_smiles.append(smi)
            processed_indices.append(i)

    # Build the frame once from the collected list: DataFrame.append was
    # removed in pandas 2.0 and copies the whole frame on every call.
    df = pd.DataFrame({'SMILES': valid_smiles})

    df['mol'] = df['SMILES'].apply(lambda smi: Chem.MolFromSmiles(smi))
    df['sentence'] = df.apply(
        lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)
    df['mol2vec'] = [
        DfVec(vector)
        for vector in sentences2vec(df['sentence'], model, unseen='UNK')
    ]
    X = np.array([entry.vec for entry in df['mol2vec']])
    return X, processed_indices, invalid_indices
r_num = 1 for fname in ligands_folder: if 'actives' in fname: receptor_name = fname.split('-actives')[0].split('/')[-1] label = 1 elif 'decoys' in fname: receptor_name = fname.split('-decoys')[0].split('/')[-1] label = 0 if receptor_name + '_' + str(label) not in d_mols.keys(): d_mols[receptor_name + '_' + str(label)] = [] df = PandasTools.LoadSDF(fname) df['sentence'] = df.apply( lambda x: MolSentence(mol2alt_sentence(x['ROMol'], 1)), axis=1) df['mol2vec'] = [ DfVec(x) for x in sentences2vec(df['sentence'], model, unseen='UNK') ] X = np.array([x.vec for x in df['mol2vec']]) d_mols[receptor_name + '_' + str(label)] = X print(str(l_num), " th receptor") l_num = l_num + 1 save_obj(d_mols, directory + 'train_test_data/' + date_str + '/ligand_dict_mols') else: ligand_dict = load_obj(savepath + '/ligand_dict_mols') ##################################################### #Data
from gensim.models import word2vec
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# df['mol'] = df['mol'].apply(lambda x: Chem.AddHs(x))

# Features: everything except the 'aff' column, which is the target.
mdf = df.drop(columns=['aff'])
y = df['aff'].values

model = word2vec.Word2Vec.load('./model_300dim.pkl')

# Constructing sentences
mdf['sentence'] = mdf.apply(
    lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)

# Extracting embeddings to a numpy.array. unseen='UNK' is always passed to
# sentences2vec() so that unknown substructures map to the 'UNK' token.
mdf['mol2vec'] = [
    DfVec(vector)
    for vector in sentences2vec(mdf['sentence'], model, unseen='UNK')
]
x = np.array([entry.vec for entry in mdf['mol2vec']])

# Scale the embedding with a MinMaxScaler fitted on x itself.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
print(x_scaled.shape)

# train_df = pd.DataFrame(x_scaled)
# mae_a = []
# Step 2 (disabled): handling of uncommon "words"
# insert_unk(corpus=result_file_path2, out_corpus=result_file_path3)
#
# Step 3 (disabled): train molecule vector
# train_word2vec_model(infile_name=result_file_path3, outfile_name=model_fp,
#                      vector_size=100, window=10, min_count=3, n_jobs=4, method='cbow')

# Get the vector of each molecule from the mol2vec model. The corpus file is
# read without a header; column 0 holds a space-separated fragment-id sentence.
print('Start to read downsampled mol sentences and load model...')
mol_info = pd.read_csv(dowmsampled_coupus_fp, header=None)
# model_fp = os.path.join(include_small_dataset_dir, 'mol2vec_related', 'mol2vec_model.pkl')
model = load_trained_model(model_fp)
# print(mol_info.loc[4568802, '0'])

mol_info['sentence'] = mol_info.apply(
    lambda row: MolSentence([str(token) for token in row[0].split(' ')]),
    axis=1)
# print(mol_info)
mol_info['mol2vec_related'] = [
    DfVec(vector) for vector in sentences2vec(mol_info['sentence'], model)
]

# Column '0' of the first result file is used to map row index -> cid.
cid2smiles = pd.read_csv(result_file_path1)
inx2cid = cid2smiles['0'].to_dict()
cid2vec = {
    inx2cid[inx]: list(mol_info.loc[inx, 'mol2vec_related'].vec)
    for inx in mol_info.index.to_list()
}

# One row per cid; written without a header, values rounded to 3 decimals.
cid2vec_df = pd.DataFrame.from_dict(cid2vec, orient='index')
print(cid2vec_df.shape)
# result_file2 = os.path.join(result_dir, 'step4_selected_mol2vec_model_mol2vec.csv')
cid2vec_df.to_csv(mol2vec_fp, header=False, float_format='%.3f')
d_mols={} l_num=1 r_num=1 for fname in ligands_folder: if 'actives' in fname: receptor_name=fname.split('-actives')[0].split('/')[-1] label=1 elif 'decoys' in fname: receptor_name=fname.split('-decoys')[0].split('/')[-1] label=0 if receptor_name+'_'+str(label) not in d_mols.keys(): d_mols[receptor_name+'_'+str(label)]=[] df = PandasTools.LoadSDF(fname) df['sentence'] = df.apply(lambda x: MolSentence(mol2alt_sentence(x['ROMol'], 1)), axis=1) df['mol2vec'] = [DfVec(x) for x in sentences2vec(df['sentence'], model, unseen='UNK')] X = np.array([x.vec for x in df['mol2vec']]) d_mols[receptor_name+'_'+str(label)]=X print(str(l_num), " th receptor") l_num = l_num+1 save_obj(d_mols, directory + 'train_test_data/'+date_str+'/ligand_dict_mols') else: ligand_dict=load_obj(savepath+'/ligand_dict_mols') ##################################################### #Data ##################################################### if generate_images: receptors = sorted(glob.glob(directory + 'pockets_dude_tiff_128/'+date_str+'/*.png'))
def create_features(data, types="train"):
    """Build three feature tables (descriptors, Morgan bits, mol2vec) from SMILES.

    Args:
        data: DataFrame with a ``"SMILES"`` column; for ``types="train"`` it
            must also have an ``'ACTIVE'`` column.
        types: ``"train"`` (labels are read from ``'ACTIVE'``) or ``"test"``
            (no labels).

    Returns:
        Dict with the keys ``"Morgan"`` (512 Morgan bit columns),
        ``"Despcritor"`` (atom counts, exact weight, ``fr_Al_COO`` and bond
        type counts), ``"molvec"`` (mol2vec embedding) and ``'y'`` (integer
        label array, or ``None`` for test data).

    Raises:
        ValueError: If ``types`` is neither ``"train"`` nor ``"test"``.
    """
    if types == "train":
        y = np.array(data['ACTIVE'].astype(int))
    elif types == "test":
        y = None
    else:
        # Previously `y` stayed unbound and the function failed at the end.
        raise ValueError(
            "types must be 'train' or 'test', got {!r}".format(types))

    # Work on a copy so that adding columns never writes into a view of the
    # caller's frame.
    data = data[["SMILES"]].copy()
    data["SMILES_str"] = data["SMILES"]
    # From here on the "SMILES" column holds RDKit molecules, not strings.
    data["SMILES"] = data["SMILES"].apply(lambda smi: Chem.MolFromSmiles(smi))
    data["NumAtoms"] = data["SMILES"].apply(
        lambda mol: mol.GetNumAtoms())  # l.HeavyAtomCount(m)
    data["ExactMolWt"] = data["SMILES"].apply(
        lambda mol: d.CalcExactMolWt(mol))
    data["fr_Al_COO"] = data["SMILES"].apply(lambda mol: f.fr_Al_COO(mol))
    # Atom count with the hydrogens explicitly present.
    data["HsNumAtoms"] = data["SMILES"].apply(
        lambda mol: Chem.AddHs(mol).GetNumAtoms())

    # One "document" of bond-type names per molecule, counted per bond type.
    BondType = [
        " ".join(str(bond.GetBondType()) for bond in mol.GetBonds())
        for mol in data["SMILES"]
    ]
    vec = CountVectorizer().fit(BondType)
    # Convert to a plain dense matrix, which is easier to inspect.
    train_tfidf = vec.transform(BondType).todense()
    vocabulary = vec.vocabulary_
    # vocabulary_ maps term -> column index. Order the names by that index;
    # plain dict iteration order need not match the matrix column order and
    # would attach the wrong label to the counts.
    bond_columns = sorted(vocabulary, key=vocabulary.get)
    # Reuse data's index so the column-wise concat stays row-aligned even for
    # a non-default index.
    train_tfidf = pd.DataFrame(train_tfidf, index=data.index)
    train_tfidf.columns = bond_columns
    data = pd.concat([data, train_tfidf], axis=1)

    traindata = data[[
        'NumAtoms', 'ExactMolWt', 'fr_Al_COO', 'HsNumAtoms', 'double',
        'single', 'aromatic', 'triple'
    ]]

    # Morgan fingerprint: radius 2, 512 bits.
    finger = [
        np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=512))
        for mol in data["SMILES"]
    ]
    finger = pd.DataFrame(finger)
    finger.columns = ["morgan_" + str(col) for col in finger.columns]

    # mol2vec embedding
    model = word2vec.Word2Vec.load('models/model_300dim.pkl')
    data['sentence'] = data.apply(
        lambda row: MolSentence(mol2alt_sentence(row['SMILES'], 1)), axis=1)
    m2v = [
        DfVec(vector)
        for vector in sentences2vec(data['sentence'], model, unseen='UNK')
    ]
    m2v = np.array([entry.vec for entry in m2v])
    m2v = pd.DataFrame(m2v)
    m2v.columns = ["m2v_" + str(col) for col in m2v.columns]

    datadict = {
        "Morgan": finger,
        "Despcritor": traindata,
        "molvec": m2v,
        'y': y
    }
    return datadict
# Loading pre-trained model via word2vec
from gensim.models import word2vec

model = word2vec.Word2Vec.load('./Datasets_final/q3/model_300dim.pkl')

# In[6]:

from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec

# Demonstration output for the molecule stored under index label 1.
_sample_words = mol2alt_sentence(mdf['mol'][1], radius=1)
print('Molecular sentence:', _sample_words)
print('\nMolSentence object:', MolSentence(_sample_words))
print('\nDfVec object:',
      DfVec(sentences2vec(MolSentence(_sample_words), model, unseen='UNK')))

# In[7]:

# Constructing sentences
mdf['sentence'] = mdf.apply(
    lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)

# Extracting embeddings to a numpy.array. unseen='UNK' is always passed to
# sentences2vec() so that unknown substructures map to the 'UNK' token.
mdf['mol2vec'] = [
    DfVec(vector)
    for vector in sentences2vec(mdf['sentence'], model, unseen='UNK')
]
X = np.array([entry.vec for entry in mdf['mol2vec']])
y = target.values
from rdkit import Chem

# Transforming SMILES to MOL
df1['mol'] = df1['SMILES sequence'].apply(lambda smi: Chem.MolFromSmiles(smi))

# The pre-trained model is read from the mounted Google Drive.
from google.colab import drive
drive.mount('/content/drive')

from gensim.models import word2vec
model = word2vec.Word2Vec.load('/content/drive/My Drive/model_300dim.pkl')

from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec

# Demonstration output for the molecule stored under index label 1.
_sample_words = mol2alt_sentence(df1['mol'][1], radius=1)
print('Molecular sentence:', _sample_words)
print('\nMolSentence object:', MolSentence(_sample_words))
print('\nDfVec object:',
      DfVec(sentences2vec(MolSentence(_sample_words), model, unseen='UNK')))

# Constructing sentences
df1['sentence'] = df1.apply(
    lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)

# Extracting embeddings to a numpy.array. unseen='UNK' is always passed to
# sentences2vec() so that unknown substructures map to the 'UNK' token.
df1['mol2vec'] = [
    DfVec(vector)
    for vector in sentences2vec(df1['sentence'], model, unseen='UNK')
]
entire_train_data = np.array([entry.vec for entry in df1['mol2vec']])
entire_train_labels = labels.values
entire_train_data.shape

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from keras import metrics, optimizers
from keras.callbacks.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import r2_score, mean_absolute_error, median_absolute_error

# Load the data and split it; 0.1 is the second argument of split_dataset.
dataset = pd.read_csv('Data/data.csv')
train_set, test_set = split_dataset(dataset, 0.1)

model = word2vec.Word2Vec.load('Mol2Vec/pretrain/model_300dim.pkl')

# SMILES -> molecule -> sentence, for both splits.
train_mol = [Chem.MolFromSmiles(smi) for smi in train_set['SMILES']]
test_mol = [Chem.MolFromSmiles(smi) for smi in test_set['SMILES']]
train_sent = [mol2alt_sentence(mol, 1) for mol in train_mol]
test_sent = [mol2alt_sentence(mol, 1) for mol in test_mol]

# sentence -> embedding vector, stacked into one array per split.
train_vec = np.array([
    DfVec(vector).vec
    for vector in sentences2vec(train_sent, model, unseen='UNK')
])
test_vec = np.array([
    DfVec(vector).vec
    for vector in sentences2vec(test_sent, model, unseen='UNK')
])

# train model: three ReLU layers of n_nodes units, then one linear output
layer_in = Input(shape=(train_vec.shape[1], ))
layer_dense = layer_in
n_nodes = 32
for j in range(3):
    layer_dense = Dense(int(n_nodes), activation="relu")(layer_dense)
layer_output = Dense(1, activation="linear")(layer_dense)