def from_feat_back_to_mols(dset, smi):
    """
    Retrieves the molecules that contain the given feature in the given dataset.

    THIS IS EXTREMELY SLOW!!!!!!! It scans the whole original fingerprints
    text file line by line; from_feat_back_to_mols_faster answers the same
    question from the fingerprints matrix instead.

    Each line of the file is split on tabs: the first field is taken as the
    molecule id and every remaining field as one feature, of which only the
    first whitespace-separated token is compared against `smi`.

    Parameters
    ----------
    dset: dataset identifier, passed as-is to MalariaFingerprintsManager.
          Only for 'lab' are the "mol" entries of the result filled in.
    smi: the feature to look for (compared for equality with the first token
         of each feature field).

    Returns
    -------
    zip(mols, molids, indices, classes), all aligned, where
      - mols holds, for dset == 'lab', the value named `smiles` yielded by
        read_labelled_smiles() for that molid (None if that molid is never
        yielded); for any other dset every entry is None
      - molids are the ids of the molecules containing the feature
      - indices are the 0-based line numbers of those molecules in the file
      - classes are the values of MalariaCatalog().label(molid, as01=True)
    """
    mols = []
    matching_molids = []
    indices = []
    classes = []
    mfm = MalariaFingerprintsManager(dset=dset)
    # The catalog does not depend on the line being read: build it at most
    # once (and only if something matches) instead of once per match.
    mc = None
    print('I am here')
    with open(mfm.original_file, 'r') as reader:
        for i, line in enumerate(reader):
            info = line.split('\t')
            molid = info[0]
            ecfps = [feat.split()[0] for feat in info[1:]]
            if smi in ecfps:
                print(molid)
                if mc is None:
                    mc = MalariaCatalog()
                matching_molids.append(molid)
                indices.append(i)
                classes.append(mc.label(molid, as01=True))
                mols.append(None)  # placeholder, filled in below for 'lab'
    print('Got the first lists')
    molids = np.array(matching_molids)
    indices = np.array(indices)
    classes = np.array(classes)
    mols = np.array(mols)
    # Now we need to retrieve the molecule (as given by read_labelled_smiles)
    # for each molid.
    if dset == 'lab':
        # Set membership is O(1); "molid in <numpy array>" is a linear scan
        # repeated for every labelled molecule.
        wanted = set(matching_molids)
        for molid, _, _, _, _, smiles in read_labelled_smiles():
            if molid in wanted:
                # The boolean mask also covers molids that matched on
                # several lines of the file.
                mols[molids == molid] = smiles
    return zip(mols, molids, indices, classes)
def from_feat_back_to_mols_faster(dset, smi):
    """
    Finds the molecules of the given dataset that contain the given feature,
    using the fingerprints matrix rather than scanning a text file.

    The matrix returned by MalariaFingerprintsManager.X() is converted with
    tocoo(); the rows of the entries whose column equals s2i(smi) are the
    molecules reported. (Presumably X is a scipy sparse matrix with one row
    per molecule and one column per feature -- to be confirmed against the
    manager.)

    Parameters
    ----------
    dset: dataset identifier, passed as-is to MalariaFingerprintsManager.
    smi: the feature to look for; translated to a column index via s2i.

    Returns
    -------
    zip(mols, molids, row_indices, activities), all aligned, where mols comes
    from MalariaCatalog.molids2mols(molids), molids from i2m(row_index) and
    activities from MalariaCatalog.label(molid, as01=True).
    """
    # The non-folded version is easy
    manager = MalariaFingerprintsManager(dset=dset)
    matrix = manager.X()
    # Column of the matrix that corresponds to the requested feature
    feature_column = manager.s2i(smi)
    coo = matrix.tocoo()
    has_feature = coo.col == feature_column
    rows_with_feature = coo.row[has_feature]

    # Row index -> molecule id
    molids = []
    for row in rows_with_feature:
        molids.append(manager.i2m(row))

    # Molecule id -> label and molecule
    catalog = MalariaCatalog()
    activities = []
    for molid in molids:
        activities.append(catalog.label(molid, as01=True))
    mols = catalog.molids2mols(molids)

    return zip(mols, molids, rows_with_feature, activities)
def mispredicted_compounds(folding_size=None):
    """
    At each fold, collect the mispredicted compounds and assemble them into
    one list of molids.

    For each of the 10 folds, 'results.pkl' is read from
      MALARIA_EXPS_ROOT/folding_rdkit/no_folding/fold=<k>/   (folding_size is None)
      MALARIA_EXPS_ROOT/folding_rdkit/fs=<folding_size>/fold=<k>/   (otherwise)
    The pickle must hold a 6-tuple; only its second element (the scores) and
    its third element (the indices of the molecules the scores refer to, in
    the 'lab' fingerprints manager) are used.

    A compound counts as mispredicted when its thresholded score
    (score >= 0.5) differs from MalariaCatalog().label(molid, as01=True).
    Compounds whose label is NaN are skipped.

    Parameters
    ----------
    folding_size: None for the unfolded experiments, otherwise the integer
                  folding size used in the directory name.

    Returns
    -------
    A list of molids, in fold order; a molid appears once per fold in which
    it was mispredicted.
    """
    num_folds = 10
    mfm = MalariaFingerprintsManager(dset='lab')
    # The catalog does not depend on the fold: build it once.
    mc = MalariaCatalog()
    if folding_size is None:
        subdir = 'no_folding'
    else:
        subdir = 'fs=%i' % folding_size
    path = op.join(MALARIA_EXPS_ROOT, 'folding_rdkit', subdir)

    mispredicted = []
    for fold_num in range(num_folds):
        results_file = op.join(path, 'fold=%i' % fold_num, 'results.pkl')
        # Pickles are binary data: text mode breaks pickle.load on python 3
        # and can corrupt the stream on platforms that translate newlines.
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(results_file, 'rb') as reader:
            _, scores, test_indices, _, _, _ = pickle.load(reader)
        predictions = scores >= 0.5  # dummy threshold
        molids_test = [mfm.i2m(i) for i in test_indices]
        classes_test = [mc.label(molid, as01=True) for molid in molids_test]
        for i, molid in enumerate(molids_test):
            if predictions[i] != classes_test[i] and not np.isnan(classes_test[i]):
                mispredicted.append(molid)
    return mispredicted