Esempio n. 1
0
def data_transforming(traindf):
    """Embed the molecules of ``traindf`` with mol2vec and return the matrix.

    Three columns are added to ``traindf`` in place: ``'mol'`` (parsed from
    the ``'SMILES sequence'`` column), ``'sentence'`` and ``'mol2vec'``.
    A short preview built from the row labelled ``1`` is printed along the
    way. The word2vec ``model`` is read from module scope.

    Args:
        traindf: DataFrame with a ``'SMILES sequence'`` column.

    Returns:
        numpy array built from the ``.vec`` attribute of every ``DfVec``.
    """
    # SMILES strings -> molecule objects.
    traindf['mol'] = traindf['SMILES sequence'].apply(Chem.MolFromSmiles)

    # Preview for the row whose index label is 1 (label lookup, not position).
    preview_mol = traindf['mol'][1]
    print('Molecular sentence:', mol2alt_sentence(preview_mol, radius=1))
    preview_sentence = MolSentence(mol2alt_sentence(preview_mol, radius=1))
    print('\nMolSentence object:', preview_sentence)
    # NOTE(review): sentences2vec receives a single MolSentence here rather
    # than a collection of sentences -- confirm the preview is meaningful.
    preview_vec = DfVec(
        sentences2vec(MolSentence(mol2alt_sentence(preview_mol, radius=1)),
                      model,
                      unseen='UNK'))
    print('\nDfVec object:', preview_vec)

    # One MolSentence per row (radius 1).
    def _row_to_sentence(row):
        return MolSentence(mol2alt_sentence(row['mol'], 1))

    traindf['sentence'] = traindf.apply(_row_to_sentence, axis=1)

    # Embed all sentences; unseen='UNK' is passed for identifiers that are
    # not in the model vocabulary.
    embedded = sentences2vec(traindf['sentence'], model, unseen='UNK')
    traindf['mol2vec'] = [DfVec(raw) for raw in embedded]

    return np.array([entry.vec for entry in traindf['mol2vec']])
Esempio n. 2
0
    def process_ligands(self, ligands):
        """Encode ``ligands`` according to ``self.drug_format``.

        * ``"labeled_smiles"``: every ligand is passed through
          ``label_smiles`` with ``self.SMILEN`` and ``self.charsmiset``;
          a list is returned.
        * ``"mol2vec"``: ligands are parsed with RDKit, turned into
          mol2vec sentences (radius ``self.mol2vec_radius``) and embedded
          with the word2vec model at ``self.mol2vec_model_path``; the
          result of ``sentences2vec`` is returned. Rows whose ``ROMol``
          is null are dropped first, so the output can be shorter than
          the input.
        * any other format: an empty list is returned.

        Args:
            ligands: a mapping (iterated by key) or an object exposing
                ``shape[0]`` and integer indexing.

        Returns:
            The encoded ligands (list or ``sentences2vec`` output).
        """
        XD = []

        if self.drug_format == "labeled_smiles":
            # Any mapping (OrderedDict, plain dict, ...) is walked by key;
            # everything else is indexed positionally.
            if isinstance(ligands, dict):
                iterator = ligands.keys()
            else:
                iterator = range(ligands.shape[0])

            XD = [
                label_smiles(ligands[d], self.SMILEN, self.charsmiset)
                for d in iterator
            ]

        elif self.drug_format == "mol2vec":
            from gensim.models import word2vec
            from mol2vec.features import (MolSentence, mol2alt_sentence,
                                          sentences2vec)
            from rdkit.Chem import PandasTools

            word2vec_model = word2vec.Word2Vec.load(self.mol2vec_model_path)
            df_ligands = pd.DataFrame({"smiles": ligands})

            PandasTools.AddMoleculeColumnToFrame(df_ligands, "smiles", "ROMol")
            # Take an explicit copy so the new column is written to an
            # independent frame and not to a slice of ``df_ligands``.
            dtc_train = df_ligands[df_ligands["ROMol"].notnull()].copy()
            dtc_train["mol-sentence"] = dtc_train.apply(
                lambda x: MolSentence(
                    mol2alt_sentence(x["ROMol"], self.mol2vec_radius)),
                axis=1,
            )
            XD = sentences2vec(dtc_train["mol-sentence"],
                               word2vec_model,
                               unseen="UNK")

        return XD
def embed_single_smiles(smiles):
    """Return the mol2vec embedding computed for one SMILES string.

    The word2vec model is loaded from ``data/model_300dim.pkl`` on every
    call.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        Whatever ``sentences2vec`` produces for the molecule's sentence.
    """
    w2v = word2vec.Word2Vec.load('data/model_300dim.pkl')
    molecule = Chem.MolFromSmiles(smiles)
    mol_sentence = MolSentence(mol2alt_sentence(molecule, 1))
    # NOTE(review): a single MolSentence is passed where sentences2vec
    # presumably expects a collection of sentences -- confirm the result
    # is the intended per-molecule vector.
    return sentences2vec(mol_sentence, w2v, unseen='UNK')
def mol2vec(fin_name, fout_name, clean=False,
            model_path='./models/model_300dim.pkl'):
    """Embed every SMILES of a CSV file with mol2vec and save the result.

    Args:
        fin_name: path of the input CSV; must contain a ``'smiles'`` column.
        fout_name: path of the CSV written with the ``smiles`` column
            followed by one ``vec_<i>`` column per embedding dimension.
        clean: when true, ``clean_file(fin_name, fin_name)`` is run first
            (rewrites the input file in place). Only needed once.
        model_path: location of the pre-trained word2vec model. Defaults
            to the path that used to be hard-coded.

    Returns:
        numpy array with one embedding row per input molecule.
    """
    if clean:
        print('cleaning data...')
        clean_file(fin_name, fin_name)

    clean_data = pd.read_csv(fin_name)

    # Load pre-trained model
    model = word2vec.Word2Vec.load(model_path)

    print('making vec data...')
    smiles = clean_data['smiles'].values
    # SMILES -> molecules -> mol2vec sentences (radius 1)
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    sentences = [MolSentence(mol2alt_sentence(mol, 1)) for mol in mols]

    # Sentences -> vectors
    vecs = [DfVec(x) for x in sentences2vec(sentences, model, unseen='UNK')]
    vec_values = np.array([v.vec for v in vecs])

    # Name the columns after the actual embedding width instead of assuming
    # a 300-dimensional model.
    n_dims = vec_values.shape[1] if vec_values.ndim == 2 else 0
    cols = ['vec_' + str(i) for i in range(n_dims)]
    df = pd.DataFrame(vec_values.reshape(len(vecs), n_dims), columns=cols)
    df.insert(0, "smiles", smiles, True)

    df.to_csv(fout_name)

    return vec_values
def embed_smiles(smiles):
    """Return one mol2vec embedding per SMILES string in ``smiles``.

    The word2vec model is loaded from ``data/model_300dim.pkl`` on every
    call.

    Args:
        smiles: iterable of SMILES strings.

    Returns:
        List holding the ``sentences2vec`` output for each molecule, in
        input order.
    """
    w2v = word2vec.Word2Vec.load('data/model_300dim.pkl')
    embeddings = []
    for smi in smiles:
        molecule = Chem.MolFromSmiles(smi)
        mol_sentence = MolSentence(mol2alt_sentence(molecule, 1))
        # NOTE(review): a single MolSentence is passed where sentences2vec
        # presumably expects a collection of sentences -- confirm.
        embeddings.append(sentences2vec(mol_sentence, w2v, unseen='UNK'))
    return embeddings
Esempio n. 6
0
 def polymer_embeddings(cls, smile):
     """Return the mol2vec embedding of one SMILES as a numpy array.

     The word2vec model is loaded from ``regressor/POLYINFO_PI1M.pkl`` on
     every call.

     Args:
         smile: SMILES string to embed.

     Returns:
         numpy array built from a one-element list of embedding vectors.
     """
     w2v = word2vec.Word2Vec.load('regressor/POLYINFO_PI1M.pkl')
     molecule = Chem.MolFromSmiles(smile)
     # sentences2vec is given a list of sentences; here it has one entry.
     batch = [MolSentence(mol2alt_sentence(molecule, 1))]
     wrapped = [DfVec(raw) for raw in sentences2vec(batch, w2v, unseen='UNK')]
     rows = []
     for entry in wrapped:
         rows.append(entry.vec.tolist())
     return np.array(rows)
Esempio n. 7
0
def download_data(dev_mode: str,
                  model: word2vec.Word2Vec) -> (np.ndarray, np.ndarray):
    """
    Load a dataset, embed it with mol2vec and return ``(X, y)`` numpy arrays.

    ``dev_mode`` is compared case-insensitively: ``'false'`` reads
    ``HIV.csv`` (labels from ``'HIV_active'``), ``'true'`` reads the example
    file ``ames.sdf`` (labels from ``'class'``). Both files are looked up in
    ``args.data_dir``, where ``args`` comes from module scope.
    """
    mode = dev_mode.lower()
    assert mode == 'false' or mode == 'true'

    if mode == 'false':
        print('Using Actual Data...')
        df = pd.read_csv(os.path.join(args.data_dir, 'HIV.csv'))

        def _to_sentence(row):
            # Molecules have to be parsed from the SMILES column first.
            return MolSentence(
                mol2alt_sentence(Chem.MolFromSmiles(row['smiles']), 1))

        label_column = 'HIV_active'
    else:
        # Example data set: LoadSDF already provides molecules in 'ROMol'.
        df = PandasTools.LoadSDF(os.path.join(args.data_dir, 'ames.sdf'))

        def _to_sentence(row):
            return MolSentence(mol2alt_sentence(row['ROMol'], 1))

        label_column = 'class'

    df['sentence'] = df.apply(_to_sentence, axis=1)
    df['mol2vec'] = [
        DfVec(raw)
        for raw in sentences2vec(df['sentence'], model, unseen='UNK')
    ]

    # Convert the dataframe columns into numpy arrays for training.
    X = np.array([entry.vec for entry in df['mol2vec']])
    y = np.array(df[label_column].astype(int))

    return X, y
def mol2vec(data):
    """Append mol2vec embedding columns to the non-molecular features.

    ``data`` is modified in place: the columns ``'sentence'`` and
    ``'mol2vec'`` are added. The word2vec model is loaded from
    ``model_300dim.pkl`` on every call.

    Args:
        data: DataFrame containing at least ``'smiles'``, ``'activity'``
            and ``'mol'`` (molecule objects).

    Returns:
        DataFrame made of every column of ``data`` except ``'smiles'``,
        ``'activity'`` and ``'mol'``, followed by the embedding columns.
    """
    x = data.drop(columns=['smiles', 'activity', 'mol'])
    model = word2vec.Word2Vec.load('model_300dim.pkl')
    data['sentence'] = data.apply(
        lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)
    # unseen='UNK' is passed for identifiers missing from the vocabulary.
    data['mol2vec'] = [
        DfVec(v) for v in sentences2vec(data['sentence'], model, unseen='UNK')
    ]
    x_mol = np.array([v.vec for v in data['mol2vec']])
    # Reuse the index of ``data``: with a fresh RangeIndex, concat(axis=1)
    # would align on labels and produce NaN-padded rows whenever ``data``
    # does not carry the default 0..n-1 index.
    x_mol = pd.DataFrame(x_mol, index=data.index)
    # Concatenating matrices of features
    new_data = pd.concat((x, x_mol), axis=1)
    return new_data
Esempio n. 9
0
def sentence2vec(sentence, mode='vec', unseen='UNK'):
    """Embed one molecular sentence with the module-level word2vec ``model``.

    Args:
        sentence: iterable of word identifiers.
        mode: ``'vec'`` returns the first (only) row produced by
            ``sentences2vec`` for ``[sentence]``; any other value returns
            one vector per word.
        unseen: token whose vector stands in for words missing from the
            vocabulary. When falsy, missing words are skipped in the
            per-word mode.

    Returns:
        The ``sentences2vec`` row in ``'vec'`` mode, otherwise a numpy
        array of per-word vectors.
    """
    if mode == 'vec':
        # Forward the caller's ``unseen`` instead of a hard-coded 'UNK' so
        # the argument is honoured in both modes (the default is unchanged).
        return sentences2vec([sentence], model, unseen=unseen)[0]

    keys = set(model.wv.vocab.keys())

    # Every word comes from ``sentence``, so membership in ``keys`` alone
    # decides whether it is known; no per-word set intersection is needed.
    if unseen:
        unseen_vec = model.wv.word_vec(unseen)
        x = [
            model.wv.word_vec(y) if y in keys else unseen_vec
            for y in sentence
        ]
    else:
        x = [model.wv.word_vec(y) for y in sentence if y in keys]

    return np.array(x)
Esempio n. 10
0
def jak2(smile):
    """Score a SMILES string with the module-level classifier ``clf``.

    The molecule is embedded with mol2vec (module-level ``model``), scored
    through ``clf.predict`` on an ``xgb.DMatrix``, scaled by 0.9 and
    perturbed with random noise of ``(random_sample() - 0.5) * 0.1``.

    Args:
        smile: SMILES string.

    Returns:
        ``0`` when the SMILES cannot be parsed, otherwise the noisy score.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        return 0

    sentence = MolSentence(mol2alt_sentence(mol, 1))
    fp = [
        DfVec(x).vec.tolist()
        for x in sentences2vec(np.array([sentence]), model, unseen='UNK')
    ]
    score = clf.predict(xgb.DMatrix(fp))[0]
    try:
        qed = QED.qed(mol)
    except Exception:
        # Best effort: a failing QED computation must not abort scoring,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        qed = 0
    # QED currently carries weight 0; the expression is kept so the weights
    # stay easy to tune.
    score = 1 * score + 0 * qed
    score = score * 0.9 + (np.random.random_sample() - 0.5) * 0.1
    return score
Esempio n. 11
0
def get_IC50():
    """
    Write a file containing the IC50, SMILES, SMILES embedding and protein embedding from the BindingDB dataset
    Input file size is 3,5Gb
    Output file size is around 25Gb

    Reads ``data/BindingDB_All.tsv`` twice (once to collect the protein
    sequences from column 37, once to write rows) and writes
    ``data/BindingDB_IC50.tsv``. A row is written only when its IC50
    (column 9) is non-empty, contains neither ``<`` nor ``>``, and its
    SMILES (column 1) can be embedded.

    Assumes ``embed_protein`` yields exactly one embedding per entry of the
    protein list, in order -- TODO confirm against its definition.
    """
    # Get all protein sequences
    Protein = []
    with open('data/BindingDB_All.tsv', encoding='utf-8') as i:
        for line in i:
            Protein.append(line.split("\t")[37])
    # Delete the header
    del Protein[0]
    # Embed the sequences
    protein_embed = embed_protein(100, Protein, 3, 5, 5)

    model = word2vec.Word2Vec.load('data/model_300dim.pkl')
    with open('data/BindingDB_All.tsv', encoding='utf-8') as i:
        with open('data/BindingDB_IC50.tsv', 'w') as o:
            for z, line in enumerate(i):
                # Write the header
                if z == 0:
                    o.write("IC50\tLigand SMILES\tSMILES embedding"
                            "\tProtein embedding\n")
                    continue

                splitline = line.split("\t")
                # Consume this row's protein embedding unconditionally.
                # Previously rows skipped by the IC50 filter did not advance
                # the generator, so every later row was paired with the
                # embedding of an earlier protein.
                protein_embedding = next(protein_embed)

                # Keep only rows with a usable IC50 value. The original
                # test `!= ("" and 0)` evaluates to `!= ""`.
                ic50 = splitline[9]
                if ic50 == "" or "<" in ic50 or ">" in ic50:
                    continue

                try:
                    m = Chem.MolFromSmiles(splitline[1])
                    # NOTE(review): a single MolSentence is passed where
                    # sentences2vec presumably expects a collection of
                    # sentences -- confirm.
                    smiles_embedding = sentences2vec(
                        MolSentence(mol2alt_sentence(m, 1)), model,
                        unseen='UNK')
                except TypeError:
                    # The SMILES could not be embedded: skip the row.
                    continue

                o.write("\t".join([
                    str(ic50),
                    str(splitline[1]),
                    str(smiles_embedding.tolist()),
                    str(protein_embedding),
                ]) + "\n")
def label(path, label_file, model, title):
    """Predict activity probabilities for the ``"test"`` split of a label file.

    Loads ``load_raw_data(path, [label_file])["test"]``, embeds the
    molecules with mol2vec, standardises the combined features and stores
    ``model.predict_proba(...)[:, 1]`` in the ``'activity'`` column of a
    copy of the data without ``'smiles'``.

    Args:
        path: forwarded to ``load_raw_data``.
        label_file: forwarded to ``load_raw_data`` inside a list.
        model: fitted classifier exposing ``predict_proba``.
        title: not used in the visible part of this function.
    """
    data = load_raw_data(path, [label_file])["test"]
    x = data.drop(columns=["smiles", "activity", 'mol'])
    process_model = word2vec.Word2Vec.load('model_300dim.pkl')
    data['sentence'] = data.apply(
        lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)
    # unseen='UNK' is passed for identifiers missing from the vocabulary.
    data['mol2vec'] = [
        DfVec(v)
        for v in sentences2vec(data['sentence'], process_model, unseen='UNK')
    ]
    x_mol = np.array([v.vec for v in data['mol2vec']])
    # Reuse the index of ``data`` so concat(axis=1) pairs rows one-to-one
    # even when ``data`` does not carry the default 0..n-1 index.
    x_mol = pd.DataFrame(x_mol, index=data.index)
    # Concatenating matrices of features
    x_test = pd.concat((x, x_mol), axis=1)
    # NOTE(review): the scaler is fitted on the test features themselves --
    # confirm this matches how the model was trained.
    x_test = StandardScaler().fit_transform(x_test)
    preds = model.predict_proba(x_test)[:, 1]
    write_data = data.drop(columns=["smiles"])
    write_data['activity'] = preds
Esempio n. 13
0
def get_fp(smiles):
    """Embed the SMILES that ``mol2image`` accepts with mol2vec.

    An entry is rejected when the first element of
    ``np.array(mol2image(entry, n=2048))`` is NaN. The word2vec model is
    loaded from ``/content/drive/My Drive/model_300dim.pkl`` on each call.

    Args:
        smiles: sequence of SMILES strings supporting ``len`` and integer
            indexing.

    Returns:
        Tuple ``(X, processed_indices, invalid_indices)``: the embedding
        array of the accepted entries, their indices, and the indices of
        the rejected entries.
    """
    model = word2vec.Word2Vec.load(
        '/content/drive/My Drive/model_300dim.pkl')
    accepted = []
    processed_indices = []
    invalid_indices = []
    # Index-based access is kept on purpose so that ``smiles[i]`` resolves
    # exactly as before for every container type callers pass in.
    for i in range(len(smiles)):
        mol = smiles[i]
        tmp = np.array(mol2image(mol, n=2048))
        if np.isnan(tmp[0]):
            invalid_indices.append(i)
        else:
            accepted.append(mol)
            processed_indices.append(i)

    # Build the frame once: DataFrame.append inside the loop copied the
    # frame on every row and no longer exists in pandas 2.x.
    df = pd.DataFrame({'SMILES': accepted}, columns=['SMILES'])
    df['mol'] = df['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))
    df['sentence'] = df.apply(
        lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)
    df['mol2vec'] = [
        DfVec(v) for v in sentences2vec(df['sentence'], model, unseen='UNK')
    ]
    X = np.array([v.vec for v in df['mol2vec']])
    return X, processed_indices, invalid_indices
Esempio n. 14
0
# df['mol'] = df['mol'].apply(lambda x: Chem.AddHs(x))
# Split the frame: every column except the target 'aff' goes to mdf,
# the target values go to y.
mdf = df.drop(columns=['aff'])
y = df['aff'].values

from gensim.models import word2vec
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec

# Load the word2vec model used to embed the molecular sentences.
model = word2vec.Word2Vec.load('./model_300dim.pkl')
# Build one MolSentence per row from the 'mol' column (radius 1).
mdf['sentence'] = mdf.apply(
    lambda x: MolSentence(mol2alt_sentence(x['mol'], 1)), axis=1)

# Embed every sentence and wrap each resulting vector in a DfVec.
# unseen='UNK' is always passed to sentences2vec() so that substructures
# unknown to the model are handled through the 'UNK' token.
mdf['mol2vec'] = [
    DfVec(x) for x in sentences2vec(mdf['sentence'], model, unseen='UNK')
]
# Collect the .vec of every DfVec into one numpy array of features.
x = np.array([x.vec for x in mdf['mol2vec']])

from sklearn import preprocessing
# Placeholder value; it is overwritten by the scaled array right below.
x_scaled = x

# Rescale the features with a MinMaxScaler fitted on x itself.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
print(x_scaled.shape)
# # train_df = pd.DataFrame(x_scaled)

# from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# mae_a = []
Esempio n. 15
0
from rdkit import Chem 

# Parse every SMILES string of df1 into a molecule object.
df1['mol'] = df1['SMILES sequence'].apply(lambda x: Chem.MolFromSmiles(x))

# The word2vec model is read from Google Drive, which has to be mounted
# first.
from google.colab import drive
drive.mount('/content/drive')

from gensim.models import word2vec
model = word2vec.Word2Vec.load('/content/drive/My Drive/model_300dim.pkl')

from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec
# Preview of the intermediate objects for the row whose index label is 1.
print('Molecular sentence:', mol2alt_sentence(df1['mol'][1], radius=1))
print('\nMolSentence object:', MolSentence(mol2alt_sentence(df1['mol'][1], radius=1)))
# NOTE(review): sentences2vec receives a single MolSentence here rather than
# a collection of sentences -- confirm the printed preview is meaningful.
print('\nDfVec object:',DfVec(sentences2vec(MolSentence(mol2alt_sentence(df1['mol'][1], radius=1)), model, unseen='UNK')))

# Build one MolSentence per row from the 'mol' column (radius 1).
df1['sentence']=df1.apply(lambda x: MolSentence(mol2alt_sentence(x['mol'], 1)), axis=1)

# Embed every sentence and wrap each resulting vector in a DfVec.
# unseen='UNK' is always passed to sentences2vec() so that substructures
# unknown to the model are handled through the 'UNK' token.
df1['mol2vec'] = [DfVec(x) for x in sentences2vec(df1['sentence'], model, unseen='UNK')]
# Feature array from the .vec of every DfVec; labels come from the
# externally defined `labels` object.
entire_train_data= np.array([x.vec for x in df1['mol2vec']])
entire_train_labels= labels.values

# Bare expression: shows the shape when run as a notebook cell.
entire_train_data.shape

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
def create_features(data, types="train"):
    """Build descriptor, Morgan-fingerprint and mol2vec feature tables.

    The caller's ``data`` frame is not modified; all work happens on a copy
    of its ``'SMILES'`` column. The word2vec model is loaded from
    ``models/model_300dim.pkl`` on every call.

    Args:
        data: DataFrame with a ``'SMILES'`` column and, for training data,
            an ``'ACTIVE'`` column.
        types: ``"train"`` (labels are read from ``'ACTIVE'``) or
            ``"test"`` (labels are ``None``).

    Returns:
        dict with the keys ``"Morgan"`` (512-bit fingerprints),
        ``"Despcritor"`` (atom counts, weight, fragment count and bond-type
        counts), ``"molvec"`` (mol2vec embedding) and ``'y'``.

    Raises:
        ValueError: if ``types`` is neither ``"train"`` nor ``"test"``.
    """
    if types == "train":
        y = np.array(data['ACTIVE'].astype(int))
    elif types == "test":
        y = None
    else:
        # Previously this fell through and failed at the very end with an
        # unbound ``y``.
        raise ValueError("types must be 'train' or 'test', got %r" % (types,))

    # Explicit copy: the assignments below must not target a slice of the
    # caller's frame.
    data = data[["SMILES"]].copy()
    data["SMILES_str"] = data["SMILES"]
    data["SMILES"] = data["SMILES"].apply(lambda x: Chem.MolFromSmiles(x))
    data["NumAtoms"] = data["SMILES"].apply(lambda x: x.GetNumAtoms())
    data["ExactMolWt"] = data["SMILES"].apply(lambda x: d.CalcExactMolWt(x))
    data["fr_Al_COO"] = data["SMILES"].apply(lambda x: f.fr_Al_COO(x))
    # Atom count with the hydrogens explicitly present.
    data["HsNumAtoms"] = data["SMILES"].apply(
        lambda x: Chem.AddHs(x).GetNumAtoms())

    # One space-separated "document" of bond-type names per molecule.
    BondType = [
        " ".join(str(b.GetBondType()) for b in m.GetBonds())
        for m in data["SMILES"]
    ]

    vec = CountVectorizer().fit(BondType)
    vocabulary = vec.vocabulary_
    # vocabulary_ maps term -> column index. Order the labels by that index;
    # iterating the dict directly yields insertion order and mislabels the
    # count columns.
    bond_columns = sorted(vocabulary, key=vocabulary.get)
    # Convert the sparse counts to a regular dense matrix and keep the row
    # index of ``data`` so the concat below pairs rows one-to-one.
    train_tfidf = pd.DataFrame(vec.transform(BondType).toarray(),
                               index=data.index,
                               columns=bond_columns)
    # A bond type that never occurs in this data set simply has count 0.
    for bond_name in ('double', 'single', 'aromatic', 'triple'):
        if bond_name not in train_tfidf.columns:
            train_tfidf[bond_name] = 0

    data = pd.concat([data, train_tfidf], axis=1)
    traindata = data[[
        'NumAtoms', 'ExactMolWt', 'fr_Al_COO', 'HsNumAtoms', 'double',
        'single', 'aromatic', 'triple'
    ]]

    finger = [
        np.array(AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=512))
        for x in data["SMILES"]
    ]
    finger = pd.DataFrame(finger)
    finger.columns = ["morgan_" + str(x) for x in finger.columns]

    model = word2vec.Word2Vec.load('models/model_300dim.pkl')
    data['sentence'] = data.apply(
        lambda x: MolSentence(mol2alt_sentence(x['SMILES'], 1)), axis=1)
    m2v = [
        DfVec(x) for x in sentences2vec(data['sentence'], model, unseen='UNK')
    ]
    m2v = np.array([x.vec for x in m2v])
    m2v = pd.DataFrame(m2v)
    m2v.columns = ["m2v_" + str(x) for x in m2v.columns]

    datadict = {
        "Morgan": finger,
        "Despcritor": traindata,
        "molvec": m2v,
        'y': y
    }

    return datadict
Esempio n. 17
0
    mol = Chem.MolFromSmiles(smiles)
    sentence = mol2alt_sentence(mol, 1)
    return sentence


#===========================
# Featurisation driver: read the SMILES file, compute the descriptor chosen
# by args.descriptor, then save the matrix with its per-column mean and
# standard deviation.
beg = time.time()

# load smiles from smiles file (one entry per line, newlines are kept)
with open(args.inp, 'r') as f:
    smiles_list = f.readlines()

if args.descriptor == 'mol2vec':
    model = word2vec.Word2Vec.load(args.mol2vec_model_pkl)
    sentences = [smiles2sencence(x) for x in smiles_list]
    # Use np.float32: a bare ``float32`` is not a defined name here, while
    # ``np`` is already relied on further down.
    X = sentences2vec(sentences, model, unseen='UNK').astype(np.float32)
else:
    processor = simple_processors[args.descriptor]
    X = [processor(x) for x in smiles_list]

end = time.time()
print('processed array of %i compounds in %f seconds' % (len(X), end - beg))

# Replace NaN/inf with finite numbers before computing the statistics.
X = np.nan_to_num(np.array(X).astype(np.float32))
mean = np.nan_to_num(X.mean(axis=0))
std = np.nan_to_num(X.std(axis=0))
# Indices of the columns that are not constant.
non_zero_idx = np.where(std > 0)[0]

gzsave(args.output_core + '.npz', X)
gzsave(args.output_core + '_mu.npz', mean)
gzsave(args.output_core + '_std.npz', std)
Esempio n. 18
0
    # # step2 Handling of uncommon "words"
    # insert_unk(corpus=result_file_path2, out_corpus=result_file_path3)
    #
    # # step3 train molecule vector
    # train_word2vec_model(infile_name=result_file_path3, outfile_name=model_fp,
    #                      vector_size=100, window=10, min_count=3, n_jobs=4, method='cbow')

    # get vector of each molecule by mol2vec model
    # mol with fragment id sentence
    print('Start to read downsampled mol sentences and load model...')
    mol_info = pd.read_csv(dowmsampled_coupus_fp, header=None)

    # model_fp = os.path.join(include_small_dataset_dir, 'mol2vec_related', 'mol2vec_model.pkl')
    model = load_trained_model(model_fp)
    # print(mol_info.loc[4568802, '0'])
    mol_info['sentence'] = mol_info.apply(
        lambda x: MolSentence([str(i) for i in x[0].split(' ')]), axis=1)
    # print(mol_info)
    mol_info['mol2vec_related'] = [
        DfVec(x) for x in sentences2vec(mol_info['sentence'], model)
    ]
    cid2vec = {}
    cid2smiles = pd.read_csv(result_file_path1)
    inx2cid = cid2smiles['0'].to_dict()
    for inx in mol_info.index.to_list():
        cid = inx2cid[inx]
        cid2vec[cid] = list(mol_info.loc[inx, 'mol2vec_related'].vec)
    cid2vec_df = pd.DataFrame.from_dict(cid2vec, orient='index')
    print(cid2vec_df.shape)
    # result_file2 = os.path.join(result_dir, 'step4_selected_mol2vec_model_mol2vec.csv')
    cid2vec_df.to_csv(mol2vec_fp, header=False, float_format='%.3f')
Esempio n. 19
0
    d_mols={}
    l_num=1
    r_num=1
    for fname in ligands_folder:   
        if 'actives' in fname:
            receptor_name=fname.split('-actives')[0].split('/')[-1]   
            label=1           
        elif 'decoys' in fname:
            receptor_name=fname.split('-decoys')[0].split('/')[-1]
            label=0            
        if receptor_name+'_'+str(label) not in d_mols.keys():
            d_mols[receptor_name+'_'+str(label)]=[]
            
        df = PandasTools.LoadSDF(fname)
        df['sentence'] = df.apply(lambda x: MolSentence(mol2alt_sentence(x['ROMol'], 1)), axis=1)
        df['mol2vec'] = [DfVec(x) for x in sentences2vec(df['sentence'], model, unseen='UNK')]
        X = np.array([x.vec for x in df['mol2vec']])
        d_mols[receptor_name+'_'+str(label)]=X

        print(str(l_num), " th receptor")
        l_num = l_num+1

    save_obj(d_mols, directory + 'train_test_data/'+date_str+'/ligand_dict_mols')
else:
    ligand_dict=load_obj(savepath+'/ligand_dict_mols')

#####################################################
#Data
#####################################################             
if generate_images:
    receptors = sorted(glob.glob(directory + 'pockets_dude_tiff_128/'+date_str+'/*.png'))
Esempio n. 20
0
# Parse the SMILES of every drug; ``ind`` and ``lst`` are initialised before
# this loop. Entries that cannot be parsed are recorded as NaN so they can be
# filtered out afterwards.
while ind < len(drugStructure):
    # Read the row at position ``ind``. The previous ``.iloc[0]`` parsed the
    # first row over and over, giving every drug the same molecule.
    smiles = drugStructure['SMILES'].iloc[ind]
    # Missing SMILES (e.g. NaN) are treated like unparsable ones.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if isinstance(mol, Chem.rdchem.Mol):
        lst.append(mol)
    else:
        lst.append(float('Nan'))
    ind += 1
    print(ind)
drugStructure['mol'] = lst
drugStructure['noSmiles'] = drugStructure['mol'].isnull()
# Keep only rows with a molecule; copy so the column assignments below write
# to an independent frame rather than to a slice.
drugStructure = drugStructure[~drugStructure['noSmiles']].copy()
drugStructure['mol-sentences'] = drugStructure.apply(
    lambda x: MolSentence(mol2alt_sentence(x['mol'], 1)), axis=1)
# One embedding per remaining drug; unseen='UNK' is passed for identifiers
# missing from the vocabulary.
drugStructure['mol2vec'] = list(
    sentences2vec(drugStructure['mol-sentences'], modelMol2Vec, unseen='UNK'))


def translateToVec(val):
    """Return a fresh 300-element list filled with 0.5.

    ``val`` is accepted but ignored. An earlier, disabled variant returned
    ``[0] * 300`` for integer input and ``val`` itself otherwise.
    """
    placeholder = [0.5 for _ in range(300)]
    return placeholder


nodecuisnodes = list(fullGraph.nodes())
featureDF = pn.DataFrame()
featureDF['Name'] = nodecuisnodes
featureDF = pn.merge(featureDF,
Esempio n. 21
0
# Load the lipophilicity data; the 'exp' column is the regression target and
# is removed from the feature frame.
mdf = pd.read_csv('Lipophilicity_df_revised.csv')
target = mdf['exp']

mdf.drop(columns='exp', inplace=True)
# Parse every SMILES string into a molecule object.
mdf['mol'] = mdf['smiles'].apply(lambda x: Chem.MolFromSmiles(x))
#Loading pre-trained model via word2vec
from gensim.models import word2vec
model = word2vec.Word2Vec.load('model_300dim.pkl')

# Exploration on the row whose index label is 1: its sentence, the model
# vocabulary, and the words of the sentence that the model knows.
mols = MolSentence(mol2alt_sentence(mdf['mol'][1], radius=1))
keys = set(model.wv.vocab.keys())
mnk = set(mols) & keys

# NOTE(review): sentences2vec receives a single MolSentence here rather than
# a collection of sentences -- confirm s2v holds what is intended.
s2v = sentences2vec(MolSentence(mol2alt_sentence(mdf['mol'][1], radius=1)),
                    model,
                    unseen='UNK')

# Build one MolSentence per row (radius 1), then embed every sentence and
# wrap each resulting vector in a DfVec. unseen='UNK' is passed for
# substructures unknown to the model.
mdf['sentence'] = mdf.apply(
    lambda x: MolSentence(mol2alt_sentence(x['mol'], 1)), axis=1)
mdf['mol2vec'] = [
    DfVec(x) for x in sentences2vec(mdf['sentence'], model, unseen='UNK')
]

# Feature array from the .vec of every DfVec and the matching target values.
# The bare .shape expressions only display something in a notebook cell.
X = np.array([x.vec for x in mdf['mol2vec']])
X.shape
y = target.values
y.shape

#For the full training set using the substructure of vectors
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
Esempio n. 22
0
# Keep one row per drug_id (the first of each group), then attach the SMILES
# from drugStructure; the inner merge drops drugs without a structure entry.
fullRepoDB = fullRepoDB.groupby('drug_id', as_index=False).first()
fullRepoDB = pn.merge(fullRepoDB,
                      drugStructure[['drug_id', 'SMILES']],
                      on='drug_id',
                      how='inner')
# Drop rows whose SMILES is null.
fullRepoDB['noSmiles'] = fullRepoDB['SMILES'].isnull()
fullRepoDB = fullRepoDB[fullRepoDB['noSmiles'] == False]
#now we get mol2vec vector for each drug

# Load the word2vec model used for the embeddings (absolute, machine-specific
# path).
modelMol2Vec = word2vec.Word2Vec.load(
    '/home/galiasn/DATA/MechanismBasedRepurposing/Data/model_300dim.pkl')

#aa_smis = fullRepoDB['mol']
#count =0
#for f in aa_smis:
#    MolSentence(mol2alt_sentence(f, 1))
#    count+=1
#    print(count)
#aas = [Chem.MolFromSmiles(x) for x in aa_smis]

# Parse the SMILES and drop the rows for which the 'mol' entry is null;
# the 'noSmiles' flag is reused for this second filter.
fullRepoDB['mol'] = fullRepoDB['SMILES'].apply(Chem.MolFromSmiles)
fullRepoDB['noSmiles'] = fullRepoDB['mol'].isnull()
fullRepoDB = fullRepoDB[fullRepoDB['noSmiles'] == False]
# Build one MolSentence per drug (radius 1) and embed it; unseen='UNK' is
# passed for identifiers missing from the vocabulary.
fullRepoDB['mol-sentences'] = fullRepoDB.apply(
    lambda x: MolSentence(mol2alt_sentence(x['mol'], 1)), axis=1)
fullRepoDB['mol2vec'] = [
    x for x in sentences2vec(
        fullRepoDB['mol-sentences'], modelMol2Vec, unseen='UNK')
]

# Distinct drug names that now have a mol2vec embedding.
mol2vecNames = fullRepoDB['drug_name'].unique()
Esempio n. 23
0
    for fname in ligands_folder:
        if 'actives' in fname:
            receptor_name = fname.split('-actives')[0].split('/')[-1]
            label = 1
        elif 'decoys' in fname:
            receptor_name = fname.split('-decoys')[0].split('/')[-1]
            label = 0
        if receptor_name + '_' + str(label) not in d_mols.keys():
            d_mols[receptor_name + '_' + str(label)] = []

        df = PandasTools.LoadSDF(fname)
        df['sentence'] = df.apply(
            lambda x: MolSentence(mol2alt_sentence(x['ROMol'], 1)), axis=1)
        df['mol2vec'] = [
            DfVec(x)
            for x in sentences2vec(df['sentence'], model, unseen='UNK')
        ]
        X = np.array([x.vec for x in df['mol2vec']])
        d_mols[receptor_name + '_' + str(label)] = X

        print(str(l_num), " th receptor")
        l_num = l_num + 1

    save_obj(d_mols,
             directory + 'train_test_data/' + date_str + '/ligand_dict_mols')
else:
    ligand_dict = load_obj(savepath + '/ligand_dict_mols')

#####################################################
#Data
#####################################################
Esempio n. 24
0
    from keras import metrics, optimizers
    from keras.callbacks.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
    from sklearn.metrics import r2_score, mean_absolute_error, median_absolute_error

    dataset = pd.read_csv('Data/data.csv')
    train_set, test_set = split_dataset(dataset, 0.1)

    model = word2vec.Word2Vec.load('Mol2Vec/pretrain/model_300dim.pkl')
    train_mol = [Chem.MolFromSmiles(x) for x in train_set['SMILES']]
    test_mol = [Chem.MolFromSmiles(x) for x in test_set['SMILES']]

    train_sent = [mol2alt_sentence(x, 1) for x in train_mol]
    test_sent = [mol2alt_sentence(x, 1) for x in test_mol]

    train_vec = [
        DfVec(x).vec for x in sentences2vec(train_sent, model, unseen='UNK')
    ]
    test_vec = [
        DfVec(x).vec for x in sentences2vec(test_sent, model, unseen='UNK')
    ]

    train_vec = np.array(train_vec)
    test_vec = np.array(test_vec)

    # train model
    layer_in = Input(shape=(train_vec.shape[1], ))
    layer_dense = layer_in
    n_nodes = 32
    for j in range(3):
        layer_dense = Dense(int(n_nodes), activation="relu")(layer_dense)
    layer_output = Dense(1, activation="linear")(layer_dense)