Example #1
def cache_all_logreg_weights():
    dsets = MANYSOURCES_MOLECULES.keys()
    feats_models = (('logreg1', 'ecfps1'),
                    ('logreg3', 'ecfps1'))
    for dset, (model, feats) in product(dsets, feats_models):
        # Cache weights for both leave-source-out and standard CV splits
        logreg_weights(dset=dset, model=model, feats=feats, lso=True)
        logreg_weights(dset=dset, model=model, feats=feats, lso=False)
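These examples are excerpts from a larger module, so their imports are not shown. A minimal sketch of what the snippets rely on, inferred from the calls rather than copied from the original file (only the manysources.datasets import is confirmed, by Example #7; helpers such as logreg_weights live elsewhere in the project):

from itertools import product
from traceback import format_exc
import os.path as op

import numpy as np
import pandas as pd

from manysources.datasets import ManysourcesDataset, MANYSOURCES_MOLECULES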
Example #2
def cache_all_scores(calib=None, lso=True):
    dsets = MANYSOURCES_MOLECULES.keys()
    feats_models = (('logreg1', 'ecfps1'),
                    ('logreg3', 'ecfps1'),
                    ('rfc1', 'rdkdescs1'))
    for dset, (model, feats) in product(dsets, feats_models):
        merge_cache_scores(dset_id=dset, model=model, feats=feats, calib=calib, lso=lso)
Example #3
def cache_all_molecules_coocurrences():
    dsets = MANYSOURCES_MOLECULES.keys()
    feats_models = (('logreg1', 'ecfps1'),
                    ('logreg3', 'ecfps1'),  # These two are the same, just put a link in the HDF5...
                    ('rfc1', 'rdkdescs1'))
    for dset, (model, feats) in product(dsets, feats_models):
        molecules_coocurrences_df(dset=dset, model=model, feats=feats, lso=True)
        molecules_coocurrences_df(dset=dset, model=model, feats=feats, lso=False)
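The comment above hints at deduplicating the two identical result sets with a link inside the HDF5 file. In h5py, assigning an existing group to a new name creates a hard link; a sketch of that idea, with a hypothetical file and group layout:

import h5py

with h5py.File('coocurrences.h5', 'a') as h5:
    # 'logreg3' becomes a hard link to the identical 'logreg1' group,
    # so the data is stored only once
    h5['logreg3'] = h5['logreg1']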
Example #4
def lsocvs_cls(step=32,
               total=4096,
               destinies=(('galileo', 46),
                          ('zeus',    32),
                          ('str22',   10),
                          ('strz',    22)),
               batch_name='logregs1',
               root=op.abspath(op.join(op.dirname(__file__), '..')),
               feats_models=(('ecfps1', 'logreg1'),
                             ('ecfps1', 'logreg3'),
                             ('ecfps1', 'logreg5'),
                             ('ecfps1', 'logreg7'),
                             ('rdkdescs1', 'rfc1'))):
    MANYSOURCES_LOGS_DIR = op.join('~', 'manysources-logs', batch_name)
    commands = []
    for dset, (feats, model) in product(MANYSOURCES_MOLECULES.keys(), feats_models):
        for start in xrange(step):
            expids = range(start, total, step)
            logfile = op.join(MANYSOURCES_LOGS_DIR,
                              '%s-%s-%s-%d_%d_%d.log' % (dset, feats, model, step, start, total))
            commands.append(
                'PYTHONPATH=.:$PYTHONPATH python2 -u manysources/experiments.py generate-lsocv-results '
                '--dset %s --feats %s --model %s --expids %s '
                '&>%s' %
                (dset, feats, model, ' '.join(map(str, expids)), logfile))

    # --- Save the cls to files

    # Remove duplicates, randomize
    commands = list(set(commands))

    # Proper balance of workloads between machines
    destinies = [(name, ['mkdir -p %s' % MANYSOURCES_LOGS_DIR], prob) for name, prob in destinies]
    p_choice = np.array([p for _, _, p in destinies], dtype=np.float)
    p_choice /= float(np.sum(p_choice))
    rng = np.random.RandomState(2147483647)
    for cl in commands:
        _, your_destiny, _ = destinies[rng.choice(len(destinies), p=p_choice)]
        your_destiny.append(cl)

    # Save the selections
    for name, cls, _ in destinies:
        with open(op.join(root, name + '-' + batch_name), 'w') as writer:
            writer.write('\n'.join(cls))

    # ----- Summary
    total_cls = sum(len(cl) for _, cl, _ in destinies)
    print 'Total number of commands: %d' % total_cls
    for name, cls, p in destinies:
        print '\t%s\t%d %g %g' % (name.ljust(30), len(cls), p, len(cls) / (total_cls + 1.))
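The balancing step above hands each command line to a machine with probability proportional to its weight. A self-contained sketch of the same idea, with hypothetical items and the weights from the defaults:

import numpy as np

def split_by_weight(items, weighted_names, seed=2147483647):
    # One bucket per machine; items land in a bucket with probability
    # proportional to the machine's weight.
    names = [name for name, _ in weighted_names]
    weights = np.array([w for _, w in weighted_names], dtype=float)
    p = weights / weights.sum()
    rng = np.random.RandomState(seed)
    buckets = dict((name, []) for name in names)
    for item in items:
        buckets[names[rng.choice(len(names), p=p)]].append(item)
    return buckets

# Roughly 46/110 of the items should end up on 'galileo'
buckets = split_by_weight(range(1000),
                          (('galileo', 46), ('zeus', 32), ('str22', 10), ('strz', 22)))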
Example #5
def cache_all_fold_sizes_aucs(drop_na=True):
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'merged_fold_sizes_aucs_bigdf.pickled')
    if not op.isfile(cache_file):
        dsets = MANYSOURCES_MOLECULES.keys()
        feats_models = (('logreg1', 'ecfps1'),
                        ('logreg3', 'ecfps1'),
                        ('rfc1', 'rdkdescs1'))
        expids = range(4096)
        dfs = []
        for dset, (model, feats), expid, lso in product(dsets, feats_models, expids, (True, False)):
            dfs.append(merge_cache_sizes_aucs(dset=dset, model=model, feats=feats, expid=expid, lso=lso))
        big_df = pd.concat(dfs, ignore_index=True).reset_index(drop=True)
        # set_index returns a new frame; keep it before pickling
        big_df = big_df.set_index(['dset', 'feats', 'model', 'expid', 'lso', 'fold'])
        big_df.to_pickle(cache_file)
    df = pd.read_pickle(cache_file)
    if drop_na:
        return df.dropna(axis=0)
    return df
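Once cached, the merged frame is indexed by (dset, feats, model, expid, lso, fold), so slicing and grouping by those levels is cheap. A hypothetical query; the 'auc' column name is an assumption, not taken from the original code:

df = cache_all_fold_sizes_aucs()
# Mean AUC per dataset and model, across experiments, folds and both split schemes
# ('auc' is an assumed column name)
print df.groupby(level=['dset', 'model'])['auc'].mean()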
Example #6
def check_regenerate_results(regenerate=False):
    for expid, dset, feat, model in product(range(4096),
                                            MANYSOURCES_MOLECULES.keys(),
                                            MANYSOURCES_FEATS.keys(),
                                            MANYSOURCES_MODELS.keys()):
        h5_file = single_hdf5_per_exp(expid=expid, dset=dset, feats=feat, model=model)
        if not op.isfile(h5_file):
            print 'Missing %s ' % h5_file
            if regenerate:
                try:
                    generate_lsocv_results(dset=dset, model=model, feats=feat, expids=(expid,), reraise=True)
                except Exception:
                    print format_exc()
                if not op.isfile(h5_file):
                    print '\tRegeneration failed'
                else:
                    print '\tRegeneration successful'
Example #7
            self._cols.extend(map(itemgetter(0), cols_vals))
            self._vals.extend(map(itemgetter(1), cols_vals))
            self._molids.append(molid)
        except Exception:
            warning('Could not compute unfolded fingerprint for molecule %s' % molid)
            self._failed_moldids.append(molid)

    def fingerprints(self):
        i2s = [smiles for smiles, _ in sorted(self._s2i.items(), key=itemgetter(1))]
        csr = coo_matrix((self._vals, (self._rows, self._cols)),
                         shape=(len(self._molids), len(self._s2i))).tocsr()
        return UnfoldedFingerprints(self._molids, i2s, csr, failed_molids=self._failed_moldids)

if __name__ == '__main__':
    from manysources.datasets import ManysourcesDataset, MANYSOURCES_MOLECULES
    for dset in MANYSOURCES_MOLECULES.keys():
        print dset
        dset = ManysourcesDataset(dset)
        ufp = dset.ecfps()
        molids, X, Y = dset.ecfps_molidsXY()
        # This should find "duplicate" features
        dupe_columns = find_sparse_dupes(X, by_rows=False)
        for group in dupe_columns:
            if len(group) > 1:
                print 'Duplicated features: %s' % ' '.join(map(str, group))
                print '\t%s' % ' '.join(ufp.substructures(list(group)))
        # This should remove "duplicate" features
        # nnz_before = X.nnz
        # X = zero_dupes(X, by_rows=False)
        # print 'Before: %d; After: %d' % (nnz_before, X.nnz)
        # This should find duplicate rows
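For reference, the fingerprints() method in Example #7 accumulates (row, col, value) triplets and converts them to CSR in one shot; duplicates for the same (row, col) are summed on conversion, so repeated substructure hits accumulate into counts. A toy illustration with made-up data:

from scipy.sparse import coo_matrix

rows, cols, vals = [0, 0, 1], [0, 2, 1], [1, 1, 3]  # three nonzeros, made up
csr = coo_matrix((vals, (rows, cols)), shape=(2, 3)).tocsr()
print csr.toarray()  # [[1 0 1]
                     #  [0 3 0]]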