def from_feat_back_to_mols(dset, smi):
    """
    Retrieves the list of molecules that contain the given feature in the given dataset. THIS IS EXTREMELY SLOW!!!!!!!

    The original fingerprints file of the dataset is scanned line by line. Each line is split on tabs: the first
    field is taken as the molecule id, and the first whitespace-separated token of every other field is taken as
    a feature. A molecule is selected when `smi` is one of its features.

    :param dset: dataset identifier, handed to MalariaFingerprintsManager (e.g. 'lab').
    :param smi: the feature to look for (presumably a substructure SMILES; confirm against the fingerprints file).
    :return: zip(mols, molids, indices, classes), one entry per selected molecule:
               - mols: the SMILES reported by read_labelled_smiles() when dset is 'lab', None otherwise
                 (no rdkit molecule is built here)
               - molids: the molecule id
               - indices: the 0-based line number of the molecule in the original fingerprints file
               - classes: the value of MalariaCatalog.label(molid, as01=True)
    """
    mfm = MalariaFingerprintsManager(dset=dset)
    print('I am here')
    found_molids = []
    found_indices = []
    with open(mfm.original_file, 'r') as reader:
        for i, line in enumerate(reader):
            info = line.split('\t')
            # Empty or whitespace-only fields (e.g. after a trailing tab) carry no feature:
            # skip them instead of failing with an IndexError.
            feats = set(tokens[0] for tokens in (field.split() for field in info[1:]) if tokens)
            if smi in feats:
                molid = info[0]
                print(molid)
                found_molids.append(molid)
                found_indices.append(i)
    # A single catalog serves all the label lookups; it is only built if something was found.
    found_classes = []
    if found_molids:
        mc = MalariaCatalog()
        found_classes = [mc.label(molid, as01=True) for molid in found_molids]
    print('Got the first lists')
    molids = np.array(found_molids)
    indices = np.array(found_indices)
    classes = np.array(found_classes)
    mols = np.array([None] * len(found_molids))
    # Now we fill in the smiles of each selected molid (only available for the labelled dataset).
    wanted = set(found_molids)  # constant-time membership instead of scanning the array for each molecule
    if dset == 'lab' and wanted:
        for molid, _, _, _, _, smiles in read_labelled_smiles():
            if molid in wanted:
                mols[molids == molid] = smiles
    return zip(mols, molids, indices, classes)
def from_feat_back_to_mols_faster(dset, smi):
    """
    Finds the molecules of a dataset that contain a given feature, using the fingerprints matrix.

    Instead of scanning the fingerprints file, the column of the feature is looked up in the matrix returned by
    MalariaFingerprintsManager.X() and the rows with an entry stored in that column are collected.

    :param dset: dataset identifier, handed to MalariaFingerprintsManager.
    :param smi: the feature to look for; it is translated into a column index with the manager's s2i().
    :return: zip(mols, molids, row_indices, activities), one entry per molecule found; mols comes from
             MalariaCatalog.molids2mols and activities from MalariaCatalog.label(molid, as01=True).
    """
    # This relies on the unfolded fingerprints, where the feature maps to one column
    manager = MalariaFingerprintsManager(dset=dset)
    matrix = manager.X()
    feature_column = manager.s2i(smi)
    # In coordinate format each stored entry carries its (row, col), so selecting the entries
    # in our column gives the rows (molecules) we are after.
    coordinates = matrix.tocoo()
    in_column = coordinates.col == feature_column
    hit_rows = coordinates.row[in_column]
    hit_molids = []
    for row in hit_rows:
        hit_molids.append(manager.i2m(row))
    catalog = MalariaCatalog()
    hit_activities = []
    for hit_molid in hit_molids:
        hit_activities.append(catalog.label(hit_molid, as01=True))
    hit_mols = catalog.molids2mols(hit_molids)
    return zip(hit_mols, hit_molids, hit_rows, hit_activities)
def mispredicted_compounds(folding_size=None):
    """
    At each fold, collect the list of mispredicted compounds and assemble it into one list of molids

    The results of the 10 folds are read from
      MALARIA_EXPS_ROOT/folding_rdkit/<no_folding | fs=SIZE>/fold=K/results.pkl
    Each results file must unpickle to a 6-tuple whose second element holds the scores of the test molecules and
    whose third element holds their indices (translated to molids with MalariaFingerprintsManager.i2m).

    A compound is mispredicted when its thresholded score (score >= 0.5) differs from its 0/1 label. Compounds
    whose label is NaN are ignored.

    :param folding_size: the folding size of the experiment to read, None for the experiment without folding.
    :return: the list of mispredicted molids, in fold order (nothing here removes repeated molids).
    """
    FOLDS = range(10)
    mfm = MalariaFingerprintsManager(dset='lab')
    mc = MalariaCatalog()  # one catalog is enough for all the folds
    subdir = 'no_folding' if folding_size is None else 'fs=%i' % folding_size
    path = op.join(MALARIA_EXPS_ROOT, 'folding_rdkit', subdir)
    mispredicted = []
    for fold in FOLDS:
        results_file = op.join(path, 'fold=%i' % fold, 'results.pkl')
        # Pickles are binary data: text mode breaks in python 3 and can corrupt the stream in windows.
        # NOTE: unpickling can run arbitrary code, only read results files that we generated ourselves.
        with open(results_file, 'rb') as reader:
            # Use a fresh name for the test indices so the fold number is not overwritten
            _, scores, test_indices, _, _, _ = pickle.load(reader)
        predictions = scores >= 0.5    # dummy threshold
        molids_test = [mfm.i2m(i) for i in test_indices]
        classes_test = [mc.label(molid, as01=True) for molid in molids_test]
        for molid, predicted, actual in zip(molids_test, predictions, classes_test):
            # NaN never equals a prediction, so unlabelled compounds must be filtered out explicitly
            if predicted != actual and not np.isnan(actual):
                mispredicted.append(molid)
    return mispredicted