def data_transforming(traindf):
    """Add mol2vec columns to ``traindf`` and return the embedding matrix.

    The frame is modified in place: the columns ``'mol'``, ``'sentence'`` and
    ``'mol2vec'`` are added. The word2vec model is read from the module-level
    name ``model``.

    Args:
        traindf: DataFrame with a ``'SMILES sequence'`` column.

    Returns:
        numpy array built from the ``.vec`` attribute of every ``DfVec``.
    """
    # Transform SMILES to RDKit molecules.
    traindf['mol'] = traindf['SMILES sequence'].apply(
        lambda smi: Chem.MolFromSmiles(smi))

    # Diagnostic output for the row labelled 1.
    demo_mol = traindf['mol'][1]
    print('Molecular sentence:', mol2alt_sentence(demo_mol, radius=1))
    print('\nMolSentence object:',
          MolSentence(mol2alt_sentence(demo_mol, radius=1)))
    demo_vec = sentences2vec(
        MolSentence(mol2alt_sentence(demo_mol, radius=1)),
        model,
        unseen='UNK')
    print('\nDfVec object:', DfVec(demo_vec))

    # Construct one sentence per molecule.
    traindf['sentence'] = traindf.apply(
        lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)

    # Extract embeddings. unseen='UNK' is always passed so that unknown
    # substructures are mapped to the model's 'UNK' vector.
    embedded = sentences2vec(traindf['sentence'], model, unseen='UNK')
    traindf['mol2vec'] = [DfVec(vec) for vec in embedded]
    return np.array([df_vec.vec for df_vec in traindf['mol2vec']])
def mol2vec(fin_name, fout_name, clean=False):
    """Embed the ``'smiles'`` column of a CSV with mol2vec and write a CSV.

    Args:
        fin_name: Path of the input CSV; it must contain a ``'smiles'`` column.
        fout_name: Path of the CSV to write (``smiles`` plus ``vec_0`` ..
            ``vec_299``).
        clean: When true, ``clean_file(fin_name, fin_name)`` is called before
            the CSV is read.

    Returns:
        numpy array of the embedding vectors, one row per SMILES.
    """
    # Cleaning is meant to drop SMILES that cannot be converted to molecules;
    # it only needs to be done once per file.
    if clean:
        print('cleaning data...')
        clean_file(fin_name, fin_name)
    clean_data = pd.read_csv(fin_name)
    smiles_values = clean_data['smiles'].values

    # Load the pre-trained model.
    model = word2vec.Word2Vec.load('./models/model_300dim.pkl')
    print('making vec data...')

    # SMILES -> molecule -> sentence.
    molecules = [Chem.MolFromSmiles(smi) for smi in smiles_values]
    sentences = []
    for mol in molecules:
        sentences.append(MolSentence(mol2alt_sentence(mol, 1)))

    # Sentence -> vector.
    embedded = sentences2vec(sentences, model, unseen='UNK')
    vec_values = np.array([DfVec(vec).vec for vec in embedded])

    # Assemble and write the output frame.
    column_names = ['vec_' + str(i) for i in range(300)]
    df = pd.DataFrame(vec_values, columns=column_names)
    df.insert(0, "smiles", smiles_values, True)
    df.to_csv(fout_name)
    return vec_values
def process_ligands(self, ligands):
    """Featurize ligands according to ``self.drug_format``.

    Args:
        ligands: For ``"labeled_smiles"`` either an ``OrderedDict`` (iterated
            by key) or an object with ``.shape`` that is indexed by position.
            For ``"mol2vec"`` a sequence of SMILES used to build a DataFrame.

    Returns:
        ``"labeled_smiles"``: list of ``label_smiles(...)`` results.
        ``"mol2vec"``: the result of ``sentences2vec`` for every ligand whose
        ``ROMol`` column is not null.
        Any other format: an empty list.
    """
    XD = []
    if self.drug_format == "labeled_smiles":
        # isinstance also accepts OrderedDict subclasses, which the previous
        # exact-type comparison sent down the array path.
        if isinstance(ligands, OrderedDict):
            keys = ligands.keys()
        else:
            keys = range(ligands.shape[0])
        XD = [
            label_smiles(ligands[key], self.SMILEN, self.charsmiset)
            for key in keys
        ]
    elif self.drug_format == "mol2vec":
        from gensim.models import word2vec
        from mol2vec.features import (MolSentence, mol2alt_sentence,
                                      sentences2vec)
        from rdkit.Chem import PandasTools

        word2vec_model = word2vec.Word2Vec.load(self.mol2vec_model_path)
        df_ligands = pd.DataFrame({"smiles": ligands})
        PandasTools.AddMoleculeColumnToFrame(df_ligands, "smiles", "ROMol")
        # Work on an explicit copy so that adding a column does not write
        # into a view of df_ligands (SettingWithCopy).
        dtc_train = df_ligands[df_ligands["ROMol"].notnull()].copy()
        dtc_train["mol-sentence"] = dtc_train["ROMol"].apply(
            lambda mol: MolSentence(
                mol2alt_sentence(mol, self.mol2vec_radius)))
        XD = sentences2vec(dtc_train["mol-sentence"], word2vec_model,
                           unseen="UNK")
    return XD
def mol2vec_features(mol2vec_path, dataframe, smiles_col, target_col, pad_to):
    """Build padded per-substructure mol2vec features for valid SMILES.

    Rows whose SMILES cannot be parsed are dropped from both features and
    labels. Words missing from the model vocabulary are skipped and do not
    consume a padding slot.

    Args:
        mol2vec_path: Path of the word2vec model to load.
        dataframe: Source DataFrame.
        smiles_col: Name of the SMILES column.
        target_col: Name of the label column.
        pad_to: Maximum number of word vectors kept per molecule.

    Returns:
        ``(features, labels)`` where ``features`` has shape
        ``(n_valid, pad_to, model.vector_size)`` and ``labels`` holds the
        labels of the valid rows.
    """
    model = word2vec.Word2Vec.load(mol2vec_path)

    smiles_lst = dataframe[smiles_col].to_numpy()
    labels_lst = dataframe[target_col].to_numpy()

    # Validate and parse in a single pass so each SMILES is parsed only once.
    valid_rows = []
    mollst = []
    for row, smi in enumerate(smiles_lst):
        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception:
            # Deliberate best effort: any parsing failure drops the row.
            continue
        if mol is None:
            continue
        valid_rows.append(row)
        mollst.append(mol)

    # An explicit integer dtype keeps the index array valid when no row
    # survived validation (an empty default array would be float).
    labels = np.array(labels_lst[np.array(valid_rows, dtype=int)])

    sentences = [mol2alt_sentence(mol, 1) for mol in mollst]
    features = np.zeros([len(mollst), pad_to, model.vector_size])
    print("mean: ", labels.mean(), "std: ", labels.std())

    for row, sentence in enumerate(sentences):
        count = 0
        for word in sentence:
            if count == pad_to:
                break
            try:
                vector = model.wv[word]
            except KeyError:
                # Unknown substructure: skip it without using a slot.
                continue
            features[row, count] = vector
            count += 1

    assert features.shape[0] == labels.shape[0]
    return features, labels
def embed_single_smiles(smiles):
    """Return the ``sentences2vec`` output for one SMILES string.

    The model is loaded from ``'data/model_300dim.pkl'`` on every call.

    NOTE(review): a single ``MolSentence`` is passed where ``sentences2vec``
    appears to expect a collection of sentences; confirm that the resulting
    array layout is what callers want.
    """
    w2v_model = word2vec.Word2Vec.load('data/model_300dim.pkl')
    molecule = Chem.MolFromSmiles(smiles)
    sentence = MolSentence(mol2alt_sentence(molecule, 1))
    return sentences2vec(sentence, w2v_model, unseen='UNK')
def mol2sentence(smiles_batch: List[str], vocab, args: Namespace) -> List[dict]:
    """Encode SMILES strings as fixed-length token-id tensors.

    Sentences are cached in the module-level ``SMILES_TO_SENTENCE`` mapping.
    SMILES that RDKit cannot parse are skipped, so the result may be shorter
    than ``smiles_batch``.

    Args:
        smiles_batch: SMILES strings to encode.
        vocab: Object exposing ``stoi``, ``unk_index``, ``sos_index``,
            ``eos_index`` and ``pad_index``.
        args: Namespace providing ``radius`` and ``seq_len``.

    Returns:
        One dict per encoded SMILES with tensor values under ``'input'`` and
        ``'segment_label'``.
    """
    encoded = []
    for smiles in smiles_batch:
        if smiles not in SMILES_TO_SENTENCE:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                continue
            SMILES_TO_SENTENCE[smiles] = mol2alt_sentence(
                mol, radius=args.radius)
        tokens = SMILES_TO_SENTENCE[smiles]

        # Convert to ids and wrap with start/end markers.
        ids = [vocab.sos_index]
        ids += [vocab.stoi.get(token, vocab.unk_index) for token in tokens]
        ids.append(vocab.eos_index)

        # Truncate, then pad both sequences with pad_index up to seq_len.
        token_ids = ids[:args.seq_len]
        segment_label = [1] * len(token_ids)
        padding = [vocab.pad_index] * (args.seq_len - len(token_ids))
        token_ids.extend(padding)
        segment_label.extend(padding)

        encoded.append({
            'input': torch.tensor(token_ids),
            'segment_label': torch.tensor(segment_label),
        })
    return encoded
def _parallel_job(smiles, r): """Helper function for joblib jobs """ if smiles is not None: # smiles = Chem.MolToSmiles(mol) mol = Chem.MolFromSmiles(smiles) sentence = mol2alt_sentence(mol, r) return " ".join(sentence)
def embed_smiles(smiles):
    """Return a list with one ``sentences2vec`` result per SMILES.

    The model is loaded from ``'data/model_300dim.pkl'`` once per call. Each
    molecule is parsed and embedded in turn.

    NOTE(review): every call hands ``sentences2vec`` a single ``MolSentence``
    rather than a collection of sentences; confirm that this is intended.
    """
    w2v_model = word2vec.Word2Vec.load('data/model_300dim.pkl')
    embedded = []
    for smi in smiles:
        molecule = Chem.MolFromSmiles(smi)
        sentence = MolSentence(mol2alt_sentence(molecule, 1))
        embedded.append(sentences2vec(sentence, w2v_model, unseen='UNK'))
    return embedded
def featurize(ligand_data, trained_model, outpath):
    """Generate features from a mol2vec model.

    Writes ``fingerprints.pkl``, ``embeddings.pkl``, ``vectors.pkl`` and
    ``ligand_vectors.csv`` into ``outpath``.

    Parameters
    ----------
    ligand_data: (str)
        A path to a csv file containing ligand structure data. The columns
        ``canonical_smiles`` and ``molecule_chembl_id`` are read.

    trained_model: (str)
        Path to a pickle file of a trained word2vec model.

    outpath: (str)
        Path for storing output files.
    """
    data = pd.read_csv(ligand_data)

    # Generate fingerprints. The column is built as a whole instead of being
    # filled through chained indexing, which is not guaranteed to write back
    # into the frame.
    print("Generating molecular fingerprints.")
    words = []
    for smiles in tqdm(data['canonical_smiles'], total=len(data)):
        ligand = Chem.MolFromSmiles(smiles)
        words.append(list(mol2alt_sentence(ligand, 1)))
    data['words'] = pd.Series(words, index=data.index, dtype='object')

    with open(outpath + "/fingerprints.pkl", 'wb') as handle:
        pickle.dump(data, handle)

    print("Finding unique fingerprints.")
    all_words = np.array(
        [word for sentence in data['words'] for word in sentence])
    unique_words = np.unique(all_words)

    # Create a data frame of embeddings, one column per unique word.
    print("Storing embeddings.")
    model = word2vec.Word2Vec.load(trained_model)
    embeddings = {}
    for word in unique_words:
        try:
            embeddings[word] = model.wv.word_vec(word)
        except KeyError:
            # Word not in the model vocabulary: use a zero vector of the
            # model's own width so every column has the same length.
            embeddings[word] = np.zeros(model.vector_size)
    embeddings = pd.DataFrame(embeddings)
    with open(outpath + "/embeddings.pkl", 'wb') as handle:
        pickle.dump(embeddings, handle)

    # Sum the word embeddings of every ligand into one vector per id.
    vectors = {}
    print("Generating vectors.")
    for mol in tqdm(data['molecule_chembl_id']):
        fingerprint = data.loc[data.molecule_chembl_id == mol, 'words']
        for sentence in fingerprint:
            components = embeddings[sentence]
            vectors[mol] = np.sum(components, axis=1)
    vectors = pd.DataFrame(vectors).T

    print("Writing csv file.")
    with open(outpath + "/vectors.pkl", 'wb') as handle:
        pickle.dump(vectors, handle)
    vectors.to_csv(outpath + "/ligand_vectors.csv")
def polymer_embeddings(cls, smile):
    """Embed one SMILES with the model at ``'regressor/POLYINFO_PI1M.pkl'``.

    Returns a numpy array with one row: the ``DfVec(...).vec`` values of the
    single sentence built from ``smile``.
    """
    model = word2vec.Word2Vec.load('regressor/POLYINFO_PI1M.pkl')
    molecule = Chem.MolFromSmiles(smile)
    sentences = [MolSentence(mol2alt_sentence(molecule, 1))]
    embedded = sentences2vec(sentences, model, unseen='UNK')
    rows = []
    for vec in embedded:
        rows.append(DfVec(vec).vec.tolist())
    return np.array(rows)
def vec_mol2vec_smile(smiles: List[str], mol2vec) -> np.ndarray:
    """Embed each SMILES word by word and post-pad to a common length.

    Every word of the radius-1 sentence is passed to ``get_embbed`` together
    with ``mol2vec``. The per-molecule sequences are then padded and truncated
    at the end by Keras ``pad_sequences`` with dtype ``float32``.
    """
    # TODO evaluate impact of radius
    vec_seqs = [
        [
            get_embbed(word, mol2vec)
            for word in mol2alt_sentence(Chem.MolFromSmiles(smi), 1)
        ]
        for smi in smiles
    ]
    return tf.keras.preprocessing.sequence.pad_sequences(
        vec_seqs, padding="post", truncating="post", dtype="float32")
def download_data(dev_mode: str,
                  model: word2vec.Word2Vec) -> (np.ndarray, np.ndarray):
    """Load a data set and return the tuple ``X, y`` of numpy arrays.

    ``dev_mode`` must be ``'true'`` or ``'false'`` in any letter case.
    ``'false'`` reads ``HIV.csv`` (label column ``HIV_active``); ``'true'``
    reads the example file ``ames.sdf`` (label column ``class``). Both files
    are looked up in ``args.data_dir``, where ``args`` is a module-level name.
    """
    mode = dev_mode.lower()
    assert mode == 'false' or mode == 'true'

    if mode == 'false':
        print('Using Actual Data...')
        df = pd.read_csv(os.path.join(args.data_dir, 'HIV.csv'))
        df['sentence'] = df.apply(
            lambda row: MolSentence(
                mol2alt_sentence(Chem.MolFromSmiles(row['smiles']), 1)),
            axis=1)
        label_column = 'HIV_active'
    else:
        # Use the example data set.
        df = PandasTools.LoadSDF(os.path.join(args.data_dir, 'ames.sdf'))
        df['sentence'] = df.apply(
            lambda row: MolSentence(mol2alt_sentence(row['ROMol'], 1)),
            axis=1)
        label_column = 'class'

    embedded = sentences2vec(df['sentence'], model, unseen='UNK')
    df['mol2vec'] = [DfVec(vec) for vec in embedded]

    # Convert the frame columns into numpy arrays for training.
    X = np.array([df_vec.vec for df_vec in df['mol2vec']])
    y = np.array(df[label_column].astype(int))
    return X, y
def mol2vec(data):
    """Append mol2vec embedding columns to the remaining feature columns.

    ``data`` gains the columns ``'sentence'`` and ``'mol2vec'`` in place. The
    returned frame is the column-wise concatenation of ``data`` without
    ``'smiles'``, ``'activity'`` and ``'mol'`` and a frame of the embedding
    vectors.
    """
    base_features = data.drop(columns=['smiles', 'activity', 'mol'])
    w2v_model = word2vec.Word2Vec.load('model_300dim.pkl')

    data['sentence'] = data.apply(
        lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)

    # Extract embeddings. unseen='UNK' is always passed so that unknown
    # substructures are mapped to the model's 'UNK' vector.
    embedded = sentences2vec(data['sentence'], w2v_model, unseen='UNK')
    data['mol2vec'] = [DfVec(vec) for vec in embedded]
    mol_features = pd.DataFrame(
        np.array([df_vec.vec for df_vec in data['mol2vec']]))

    # Concatenate the two feature matrices column-wise.
    return pd.concat((base_features, mol_features), axis=1)
def forward(self, smiles_batch: List[str]) -> Tuple[torch.Tensor, torch.Tensor]:
    """Embed a batch of SMILES as zero-padded word-vector sequences.

    Args:
        smiles_batch: SMILES strings to embed.

    Returns:
        ``(emb_tensor, length_tensor)``. ``emb_tensor`` is built from an array
        of shape ``(batch, max_seq_len, self.mol2vec_embed_dim)`` and passed
        through ``self.ffn`` when that is not ``None``. ``length_tensor``
        holds the unpadded length of every sequence. Both tensors are moved
        to CUDA when it is available.
    """
    embs = []
    lengths = []
    max_seq_len = 0
    batch_size = len(smiles_batch)

    for smiles in smiles_batch:
        cached = self.mapping.get(smiles, None)
        if cached is None:
            mol = Chem.MolFromSmiles(smiles)
            sentence = mol2alt_sentence(mol, radius=1)
            vectors = []
            for word in sentence:
                try:
                    try:
                        vec = self.mol2vec.wv.word_vec(word)
                    except AttributeError:
                        # Model object without a ``wv`` attribute.
                        vec = self.mol2vec[word]
                except KeyError:
                    vec = self.unk_emb
                vectors.append(vec)
            # (seq_len, embed_dim). The builtin ``float`` replaces the
            # ``np.float`` alias that was removed from NumPy.
            emb = np.array(vectors, dtype=float)
        else:
            # NOTE(review): assumes self.mapping stores a ready
            # (seq_len, embed_dim) embedding per SMILES; confirm against
            # the code that fills it.
            emb = np.asarray(cached, dtype=float)

        seq_len = len(emb)
        if seq_len > max_seq_len:
            max_seq_len = seq_len
        embs.append(emb)
        lengths.append(seq_len)

    emb_data = np.zeros((batch_size, max_seq_len, self.mol2vec_embed_dim),
                        dtype=float)
    for emb_no, emb in enumerate(embs):
        # Zero-length sequences stay all-zero; copying an empty 1-D array
        # into a (0, embed_dim) slice would fail to broadcast.
        if lengths[emb_no]:
            emb_data[emb_no, :lengths[emb_no]] = emb
    emb_tensor = torch.Tensor(emb_data)

    length_data = np.array(lengths, dtype=np.int64)
    length_tensor = torch.LongTensor(length_data)

    if torch.cuda.is_available():
        emb_tensor = emb_tensor.cuda()
        length_tensor = length_tensor.cuda()

    if self.ffn is not None:
        emb_tensor = self.ffn(emb_tensor)

    return emb_tensor, length_tensor
def smiles2vector_duplicates_average(smiles_string):
    """Convert SMILES to the mean of its word embeddings.

    Every word of the radius-1 sentence contributes once per occurrence. The
    vectors are looked up in the module-level ``mol2vec_model``.

    Args:
        smiles_string (string): single SMILES string

    Returns:
        embedding (numpy.ndarray): sum of the word vectors divided by the
            number of words
    """
    sentence = mol2alt_sentence(Chem.MolFromSmiles(smiles_string), radius=1)
    total = sum(mol2vec_model.wv[word] for word in sentence)
    return total / len(sentence)
def mol2vec_features(model, dataframe, smiles_col, target_col, pad_to):
    """Build padded per-substructure mol2vec features for every row.

    Words missing from ``model.wv`` are skipped and do not consume a padding
    slot. At most ``pad_to`` word vectors are kept per molecule.

    Returns:
        ``(features, labels)`` with ``features`` of shape
        ``(n_rows, pad_to, model.vector_size)`` and ``labels`` reshaped to a
        single column.
    """
    molecules = [Chem.MolFromSmiles(smi) for smi in dataframe[smiles_col]]
    sentences = [mol2alt_sentence(mol, 1) for mol in molecules]

    features = np.zeros([len(molecules), pad_to, model.vector_size])
    labels = np.array(dataframe[target_col]).reshape(-1, 1)
    print("mean: ", labels.mean(), "std: ", labels.std())

    for row, sentence in enumerate(sentences):
        filled = 0
        for word in sentence:
            if filled == pad_to:
                break
            try:
                features[row, filled] = model.wv[word]
            except KeyError:
                # Unknown word: leave the slot free for the next one.
                continue
            filled += 1

    assert features.shape[0] == labels.shape[0]
    return features, labels
def jak2(smile):
    """Score one SMILES with the module-level classifier ``clf``.

    Returns ``0`` when RDKit cannot parse ``smile``. Otherwise the molecule is
    embedded with the module-level mol2vec ``model``, scored by ``clf``, and
    the score is scaled by 0.9 and perturbed with uniform noise in
    ``[-0.05, 0.05)``.
    """
    mol = Chem.MolFromSmiles(smile)
    if not mol:
        return 0

    sentence = MolSentence(mol2alt_sentence(mol, 1))
    fp = [
        DfVec(x).vec.tolist()
        for x in sentences2vec(np.array([sentence]), model, unseen='UNK')
    ]
    score = clf.predict(xgb.DMatrix(fp))[0]

    try:
        qed = QED.qed(mol)
    except Exception:
        # Best effort: a failing QED computation must not abort scoring.
        # Narrowed from a bare except so KeyboardInterrupt and SystemExit
        # still propagate.
        qed = 0

    # QED currently carries weight 0; the expression is kept as written so
    # that a non-finite qed still propagates exactly as before.
    score = 1 * score + 0 * qed
    score = score * 0.9 + (np.random.random_sample() - 0.5) * 0.1
    return score
def get_IC50():
    """Write the IC50, SMILES, SMILES embedding and protein embedding of
    every usable BindingDB row to ``data/BindingDB_IC50.tsv``.

    Input file size is 3,5Gb
    Output file size is around 25Gb

    A row is written only when its IC50 field (column 9) is non-empty and
    contains neither ``<`` nor ``>``, and its SMILES (column 1) can be
    embedded without a ``TypeError``.
    """
    # Get all protein sequences (column 37), header included.
    proteins = []
    with open('data/BindingDB_All.tsv', encoding='utf-8') as source:
        for line in source:
            proteins.append(line.split("\t")[37])
    # Delete the header entry.
    del proteins[0]

    # Embed the sequences.
    # NOTE(review): assumes embed_protein yields one embedding per input
    # sequence, in order; confirm against its definition.
    protein_embed = embed_protein(100, proteins, 3, 5, 5)
    model = word2vec.Word2Vec.load('data/model_300dim.pkl')

    with open('data/BindingDB_All.tsv', encoding='utf-8') as source, \
            open('data/BindingDB_IC50.tsv', 'w') as out:
        for row_no, line in enumerate(source):
            fields = line.split("\t")

            # Write the header.
            if row_no == 0:
                out.write("IC50\tLigand SMILES\tSMILES embedding"
                          "\tProtein embedding\n")
                continue

            # Advance the generator for EVERY data row. Previously it was
            # skipped for rows rejected by the IC50 filter, which paired all
            # later rows with the wrong protein.
            protein_vec = next(protein_embed)

            ic50 = fields[9]
            # The original test `!= ("" and 0)` reduces to `!= ""`.
            # NOTE(review): a literal "0" is therefore still accepted;
            # confirm whether zero values should also be dropped.
            if ic50 == "" or "<" in ic50 or ">" in ic50:
                continue

            try:
                mol = Chem.MolFromSmiles(fields[1])
                smiles_embedding = sentences2vec(
                    MolSentence(mol2alt_sentence(mol, 1)),
                    model,
                    unseen='UNK')
                embedding_text = str(smiles_embedding.tolist())
            except TypeError:
                # SMILES could not be embedded; skip the row.
                continue

            out.write("\t".join([
                str(ic50),
                str(fields[1]),
                embedding_text,
                str(protein_vec),
            ]) + "\n")
def label(path, label_file, model, title):
    """Predict activities for the ``"test"`` split of ``label_file``.

    The test frame gains ``'sentence'`` and ``'mol2vec'`` columns. The
    remaining feature columns are concatenated with the mol2vec vectors,
    standard-scaled, and scored with ``model.predict_proba``; the
    positive-class probabilities are stored in the ``'activity'`` column of a
    copy without ``"smiles"``.
    """
    data = load_raw_data(path, [label_file])["test"]
    x = data.drop(columns=["smiles", "activity", 'mol'])
    process_model = word2vec.Word2Vec.load('model_300dim.pkl')

    data['sentence'] = data.apply(
        lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)

    # Extract embeddings. unseen='UNK' is always passed so that unknown
    # substructures are mapped to the model's 'UNK' vector.
    embedded = sentences2vec(data['sentence'], process_model, unseen='UNK')
    data['mol2vec'] = [DfVec(vec) for vec in embedded]
    x_mol = pd.DataFrame(np.array([df_vec.vec for df_vec in data['mol2vec']]))

    # Concatenate the feature matrices, then scale.
    x_test = pd.concat((x, x_mol), axis=1)
    x_test = StandardScaler().fit_transform(x_test)

    preds = model.predict_proba(x_test)[:, 1]
    write_data = data.drop(columns=["smiles"])
    write_data['activity'] = preds
def get_fp(smiles):
    """Embed every SMILES that ``mol2image`` can render.

    Args:
        smiles: Indexable collection of SMILES strings.

    Returns:
        ``(X, processed_indices, invalid_indices)``: the mol2vec embedding
        matrix of the accepted SMILES, the positions that were embedded, and
        the positions whose ``mol2image`` result starts with NaN.
    """
    model = word2vec.Word2Vec.load(
        '/content/drive/My Drive/model_300dim.pkl')

    accepted_smiles = []
    processed_indices = []
    invalid_indices = []
    for i in range(len(smiles)):
        smi = smiles[i]
        image = np.array(mol2image(smi, n=2048))
        if np.isnan(image[0]):
            invalid_indices.append(i)
        else:
            accepted_smiles.append(smi)
            processed_indices.append(i)

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # was quadratic when called per row.
    df = pd.DataFrame({'SMILES': accepted_smiles})

    df['mol'] = df['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))
    df['sentence'] = df.apply(
        lambda x: MolSentence(mol2alt_sentence(x['mol'], 1)), axis=1)
    df['mol2vec'] = [
        DfVec(x) for x in sentences2vec(df['sentence'], model, unseen='UNK')
    ]
    X = np.array([x.vec for x in df['mol2vec']])
    return X, processed_indices, invalid_indices
d_mols = {} l_num = 1 r_num = 1 for fname in ligands_folder: if 'actives' in fname: receptor_name = fname.split('-actives')[0].split('/')[-1] label = 1 elif 'decoys' in fname: receptor_name = fname.split('-decoys')[0].split('/')[-1] label = 0 if receptor_name + '_' + str(label) not in d_mols.keys(): d_mols[receptor_name + '_' + str(label)] = [] df = PandasTools.LoadSDF(fname) df['sentence'] = df.apply( lambda x: MolSentence(mol2alt_sentence(x['ROMol'], 1)), axis=1) df['mol2vec'] = [ DfVec(x) for x in sentences2vec(df['sentence'], model, unseen='UNK') ] X = np.array([x.vec for x in df['mol2vec']]) d_mols[receptor_name + '_' + str(label)] = X print(str(l_num), " th receptor") l_num = l_num + 1 save_obj(d_mols, directory + 'train_test_data/' + date_str + '/ligand_dict_mols') else: ligand_dict = load_obj(savepath + '/ligand_dict_mols')
import pandas as pd
import numpy as np
from rdkit import Chem
from mol2vec.features import mol2alt_sentence,MolSentence
from gensim.models import word2vec
import torch

# Load the training set: labels come from "ACTIVE", structures from "SMILES".
data = pd.read_csv("../training_smiles.csv")
y = np.array(data["ACTIVE"].astype(int))
data = data[["SMILES"]]
# Keep the raw SMILES text; the "SMILES" column is replaced by RDKit molecules.
data["SMILES_str"] = data["SMILES"]
data["SMILES"] = data["SMILES"].apply(lambda x: Chem.MolFromSmiles(x))
model = word2vec.Word2Vec.load('../models/model_300dim.pkl')
# One radius-1 MolSentence per molecule.
data['sentence'] = data.apply(lambda x: MolSentence(mol2alt_sentence(x['SMILES'], 1)), axis=1)
# From here on `data` is a plain list of the `.sentence` attribute of each MolSentence.
data = [x.sentence for x in data['sentence']]
# Vocabulary without 'UNK'; 'UNK' is given index 0 below.
vocabs = [x for x in model.wv.index2word if x != 'UNK']
vocab_size = len(vocabs)+1
embed_size = model.wv.vector_size
weight = torch.zeros(vocab_size, embed_size)
# Words are numbered from 1 in vocabulary order; index 0 is reserved for 'UNK'.
word_to_idx = {word: i+1 for i, word in enumerate(vocabs) }
word_to_idx['UNK']=0
idx_to_word = {i+1: word for i, word in enumerate(vocabs) }
idx_to_word[0]='UNK'
vocabs.append('UNK')
# Copy every word vector (including 'UNK') into its row of the weight matrix.
for i in range(len(vocabs)):
    index = word_to_idx[vocabs[i]]
    weight[index, :] = torch.from_numpy(model.wv.get_vector(idx_to_word[word_to_idx[vocabs[i]]]))
def smiles2sentence(smiles):
    """Parse ``smiles`` with RDKit and return its radius-1 mol2vec sentence."""
    return mol2alt_sentence(Chem.MolFromSmiles(smiles), 1)
from rdkit import Chem from mol2vec.features import mol2alt_sentence from gensim.models import word2vec from tqdm import tqdm # Read data data = pd.read_csv("all_unique_ligands.csv") ligands = (Chem.MolFromSmiles(x) for x in data['canonical_smiles']) # Create new column to store fingerprints data['words'] = np.zeros(len(data), dtype='object') print("Generating molecular fingerprints") i = 0 with tqdm(total=len(data)) as pbar: for l in ligands: fingerprint = mol2alt_sentence(l, 1) data['words'][i] = list(fingerprint) i += 1 pbar.update() pickle.dump(data, open("fingerprints.pkl", 'wb')) # Find all unique words print("Finding unique fingerprints") all_words = np.array([word for sentence in data['words'] for word in sentence]) unique_words = np.unique(all_words) # Create a data frame of embeddings print("Storing embeddings") model = word2vec.Word2Vec.load('model_300dim.pkl') embeddings = {} for word in unique_words:
def storeMolecule(): pass """ Test running """ directory = "/home/noh/Desktop/CURRENT_WORK_IN_PROGRESS/Chemiinformatics/RDKIT/rdkit/Docs/Book/data" sdf_file = 'bzr.sdf' process = rdkit_processdf( directory, sdf_file) # Initialization of the class that reads the sdf file molList = process.returnMol() molSmiles = process.MoltoSmiles() mol2VecList = [ mol2alt_sentence(x, 1) for x in molList ] # Using mol2vec to encode molecules as sentences, meaning that each substructure # represents a word # Defining the number of hidden layers and the number of nodes inside them n_hidden1 = 300 n_hidden2 = 100 n_hidden3 = 100 """ --------------------------------------------- | Fingerprinting and Molecular Similarity | --------------------------------------------- The RDkit has a variety of built-in functionality for generating fingerprints and using them to calculate molecular similarity. The RDKit has a variety for
d_mols={} l_num=1 r_num=1 for fname in ligands_folder: if 'actives' in fname: receptor_name=fname.split('-actives')[0].split('/')[-1] label=1 elif 'decoys' in fname: receptor_name=fname.split('-decoys')[0].split('/')[-1] label=0 if receptor_name+'_'+str(label) not in d_mols.keys(): d_mols[receptor_name+'_'+str(label)]=[] df = PandasTools.LoadSDF(fname) df['sentence'] = df.apply(lambda x: MolSentence(mol2alt_sentence(x['ROMol'], 1)), axis=1) df['mol2vec'] = [DfVec(x) for x in sentences2vec(df['sentence'], model, unseen='UNK')] X = np.array([x.vec for x in df['mol2vec']]) d_mols[receptor_name+'_'+str(label)]=X print(str(l_num), " th receptor") l_num = l_num+1 save_obj(d_mols, directory + 'train_test_data/'+date_str+'/ligand_dict_mols') else: ligand_dict=load_obj(savepath+'/ligand_dict_mols') ##################################################### #Data ##################################################### if generate_images:
plt.title("MAE {}, MSE {}".format(round(mae, 4), round(mse, 4))) plt.show() print('MAE score:', round(mae, 4)) print('MSE score:', round(mse,4)) #Read and initialize the Lipophilicity database mdf= pd.read_csv('Lipophilicity_df_revised.csv') target = mdf['exp'] mdf.drop(columns='exp',inplace=True) mdf['mol'] = mdf['smiles'].apply(lambda x: Chem.MolFromSmiles(x)) #Loading pre-trained model via word2vec model = word2vec.Word2Vec.load('model_300dim.pkl') mols = MolSentence(mol2alt_sentence(mdf['mol'][1], radius=1)) keys = set(model.wv.vocab.keys()) mnk = set(mols)&keys s2v = sentences2vec(MolSentence(mol2alt_sentence(mdf['mol'][1], radius=1)), model, unseen='UNK') mdf['sentence'] = mdf.apply(lambda x: MolSentence(mol2alt_sentence(x['mol'], 1)), axis=1) mdf['mol2vec'] = [DfVec(x) for x in sentences2vec(mdf['sentence'], model, unseen='UNK')] X = np.array([x.vec for x in mdf['mol2vec']]) X.shape y = target.values y.shape #For the full training set using the substructure of vectors from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec model = word2vec.Word2Vec.load('model_300dim.pkl')
plt.show() print('MAE score:', round(mae, 4)) print('MSE score:', round(mse, 4)) mdf = pd.read_csv('Lipophilicity_df_revised.csv') target = mdf['exp'] mdf.drop(columns='exp', inplace=True) mdf['mol'] = mdf['smiles'].apply(lambda x: Chem.MolFromSmiles(x)) #Loading pre-trained model via word2vec from gensim.models import word2vec model = word2vec.Word2Vec.load('model_300dim.pkl') mols = MolSentence(mol2alt_sentence(mdf['mol'][1], radius=1)) keys = set(model.wv.vocab.keys()) mnk = set(mols) & keys s2v = sentences2vec(MolSentence(mol2alt_sentence(mdf['mol'][1], radius=1)), model, unseen='UNK') mdf['sentence'] = mdf.apply( lambda x: MolSentence(mol2alt_sentence(x['mol'], 1)), axis=1) mdf['mol2vec'] = [ DfVec(x) for x in sentences2vec(mdf['sentence'], model, unseen='UNK') ] X = np.array([x.vec for x in mdf['mol2vec']]) X.shape
##from mol2vec.helpers import depict_identifier, plot_2D_vectors, IdentifierTable, mol_to_svg aa_smis = ['CC(N)C(=O)O', 'N=C(N)NCCCC(N)C(=O)O', 'NC(=O)CC(N)C(=O)O', 'NC(CC(=O)O)C(=O)O', 'NC(CS)C(=O)O', 'NC(CCC(=O)O)C(=O)O', 'NC(=O)CCC(N)C(=O)O', 'NCC(=O)O', 'NC(Cc1cnc[nH]1)C(=O)O', 'CCC(C)C(N)C(=O)O', 'CC(C)CC(N)C(=O)O', 'NCCCCC(N)C(=O)O', 'CSCCC(N)C(=O)O', 'NC(Cc1ccccc1)C(=O)O', 'O=C(O)C1CCCN1', 'NC(CO)C(=O)O', 'CC(O)C(N)C(=O)O', 'NC(Cc1c[nH]c2ccccc12)C(=O)O', 'NC(Cc1ccc(O)cc1)C(=O)O', 'CC(C)C(N)C(=O)O'] aa_codes = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL'] aas = [Chem.MolFromSmiles(x) for x in aa_smis] from gensim.models import word2vec model = word2vec.Word2Vec.load('model_300dim.pkl') aa_sentences = [mol2alt_sentence(x, 1) for x in aas] aalist={} index=0 for x in aa_sentences: aa= np.zeros(300) for y in x: aa=aa+model.wv.word_vec(y) aalist[aa_codes[index]]=aa #print (aa) index=index+1 for name in aa_codes: print (name, aalist[name] ) #print (name, ' '.join( str(x) for x in list[name]))
def molecule2sentence(molecule, radius=1):
    """Return the mol2vec sentence of ``molecule`` for the given ``radius``."""
    return mol2alt_sentence(molecule, radius=radius)