def cache_all_logreg_weights():
    dsets = MANYSOURCES_MOLECULES.keys()
    feats_models = (('logreg1', 'ecfps1'),
                    ('logreg3', 'ecfps1'))
    for dset, (model, feats) in product(dsets, feats_models):
        # Cache weights for both LSO and non-LSO partitionings
        logreg_weights(dset=dset, model=model, feats=feats, lso=True)
        logreg_weights(dset=dset, model=model, feats=feats, lso=False)
def cache_all_scores(calib=None, lso=True):
    dsets = MANYSOURCES_MOLECULES.keys()
    feats_models = (('logreg1', 'ecfps1'),
                    ('logreg3', 'ecfps1'),
                    ('rfc1', 'rdkdescs1'))
    for dset, (model, feats) in product(dsets, feats_models):
        merge_cache_scores(dset_id=dset, model=model, feats=feats, calib=calib, lso=lso)
def cache_all_molecules_coocurrences():
    dsets = MANYSOURCES_MOLECULES.keys()
    feats_models = (('logreg1', 'ecfps1'),
                    ('logreg3', 'ecfps1'),  # These two are the same, just put a link in the HDF5...
                    ('rfc1', 'rdkdescs1'))
    for dset, (model, feats) in product(dsets, feats_models):
        molecules_coocurrences_df(dset=dset, model=model, feats=feats, lso=True)
        molecules_coocurrences_df(dset=dset, model=model, feats=feats, lso=False)
def lsocvs_cls(step=32,
               total=4096,
               destinies=(('galileo', 46), ('zeus', 32), ('str22', 10), ('strz', 22)),
               batch_name='logregs1',
               root=op.abspath(op.join(op.dirname(__file__), '..')),
               feats_models=(('ecfps1', 'logreg1'),
                             ('ecfps1', 'logreg3'),
                             ('ecfps1', 'logreg5'),
                             ('ecfps1', 'logreg7'),
                             ('rdkdescs1', 'rfc1'))):
    MANYSOURCES_LOGS_DIR = op.join('~', 'manysources-logs', batch_name)
    commands = []
    for dset, (feats, model) in product(MANYSOURCES_MOLECULES.keys(), feats_models):
        for start in xrange(step):
            expids = range(start, total, step)
            logfile = op.join(MANYSOURCES_LOGS_DIR,
                              '%s-%s-%s-%d_%d_%d.log' % (dset, feats, model, step, start, total))
            commands.append(
                'PYTHONPATH=.:$PYTHONPATH python2 -u manysources/experiments.py generate-lsocv-results '
                '--dset %s --feats %s --model %s --expids %s '
                '&>%s' % (dset, feats, model, ' '.join(map(str, expids)), logfile))

    # --- Save the cls to files

    # Remove duplicate command lines
    commands = list(set(commands))

    # Randomly assign commands to machines, proportionally to each machine's weight
    destinies = [(name, ['mkdir -p %s' % MANYSOURCES_LOGS_DIR], prob) for name, prob in destinies]
    p_choice = np.array([p for _, _, p in destinies], dtype=np.float)
    p_choice /= float(np.sum(p_choice))
    rng = np.random.RandomState(2147483647)
    for cl in commands:
        _, your_destiny, _ = destinies[rng.choice(len(destinies), p=p_choice)]
        your_destiny.append(cl)

    # Save the selections
    for name, cls, _ in destinies:
        with open(op.join(root, name + '-' + batch_name), 'w') as writer:
            writer.write('\n'.join(cls))

    # ----- Summary
    total_cls = sum(len(cl) for _, cl, _ in destinies)
    print 'Total number of commands: %d' % total_cls
    for name, cls, p in destinies:
        print '\t%s\t%d %g %g' % (name.ljust(30), len(cls), p, len(cls) / (total_cls + 1.))
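# Usage sketch for the files lsocvs_cls() writes (an assumption, not pinned down here:
# each destiny name matches a reachable machine and the repo is checked out there).
# Each "<machine>-<batch_name>" file holds one shell command per line, so it can be
# run sequentially with bash from the repo root, e.g. on galileo:
#   cd manysources-repo-root && bash galileo-logregs1
# bash is required because the commands use the &> redirection; each command logs
# to its own file under ~/manysources-logs/<batch_name>.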
def cache_all_fold_sizes_aucs(drop_na=True):
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'merged_fold_sizes_aucs_bigdf.pickled')
    if not op.isfile(cache_file):
        dsets = MANYSOURCES_MOLECULES.keys()
        feats_models = (('logreg1', 'ecfps1'),
                        ('logreg3', 'ecfps1'),
                        ('rfc1', 'rdkdescs1'))
        expids = range(4096)
        dfs = []
        for dset, (model, feats), expid, lso in product(dsets, feats_models, expids, (True, False)):
            dfs.append(merge_cache_sizes_aucs(dset=dset, model=model, feats=feats, expid=expid, lso=lso))
        big_df = pd.concat(dfs, ignore_index=True).reset_index(drop=True)
        # N.B. set_index returns a new frame; the result must be kept
        big_df = big_df.set_index(['dset', 'feats', 'model', 'expid', 'lso', 'fold'])
        big_df.to_pickle(cache_file)
    df = pd.read_pickle(cache_file)
    if drop_na:
        return df.dropna(axis=0)
    return df
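def _example_slice_fold_aucs():
    # Illustrative sketch only, not part of the pipeline: how the cached frame
    # can be sliced through the MultiIndex set in cache_all_fold_sizes_aucs().
    # It assumes the merged frame carries an 'auc' column, which this file does
    # not pin down.
    df = cache_all_fold_sizes_aucs()
    lso_only = df.xs(True, level='lso')  # keep only the LSO folds
    # Mean AUC per dataset and model across experiments and folds
    return lso_only.groupby(level=['dset', 'model'])['auc'].mean()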
def check_regenerate_results(regenerate=False):
    for expid, dset, feat, model in product(range(4096),
                                            MANYSOURCES_MOLECULES.keys(),
                                            MANYSOURCES_FEATS.keys(),
                                            MANYSOURCES_MODELS.keys()):
        h5_file = single_hdf5_per_exp(expid=expid, dset=dset, feats=feat, model=model)
        if not op.isfile(h5_file):
            print 'Missing %s' % h5_file
            if regenerate:
                try:
                    generate_lsocv_results(dset=dset, model=model, feats=feat, expids=(expid,), reraise=True)
                except Exception:
                    print format_exc()
                if not op.isfile(h5_file):
                    print '\tRegeneration failed'
                else:
                    print '\tRegeneration successful'
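# Sketch of intended use (an assumption: run after the cluster batch has finished):
#   check_regenerate_results()                 # just report missing HDF5 result files
#   check_regenerate_results(regenerate=True)  # also retry the missing experiments locally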
            self._cols.extend(map(itemgetter(0), cols_vals))
            self._vals.extend(map(itemgetter(1), cols_vals))
            self._molids.append(molid)
        except Exception:
            warning('Could not compute unfolded fingerprint for molecule %s' % molid)
            self._failed_moldids.append(molid)

    def fingerprints(self):
        i2s = [smiles for smiles, _ in sorted(self._s2i.items(), key=itemgetter(1))]
        csr = coo_matrix((self._vals, (self._rows, self._cols)),
                         shape=(len(self._molids), len(self._s2i))).tocsr()
        return UnfoldedFingerprints(self._molids, i2s, csr, failed_molids=self._failed_moldids)


if __name__ == '__main__':

    from manysources.datasets import ManysourcesDataset, MANYSOURCES_MOLECULES

    for dset in MANYSOURCES_MOLECULES.keys():
        print dset
        dset = ManysourcesDataset(dset)
        ufp = dset.ecfps()
        molids, X, Y = dset.ecfps_molidsXY()

        # This should find "duplicate" features
        dupe_columns = find_sparse_dupes(X, by_rows=False)
        for group in dupe_columns:
            if len(group) > 1:
                print 'Duplicated features: %s' % ' '.join(map(str, group))
                print '\t%s' % ' '.join(ufp.substructures(list(group)))

        # This should remove "duplicate" features
        # nnz_before = X.nnz
        # X = zero_dupes(X, by_rows=False)
        # print 'Before: %d; After: %d' % (nnz_before, X.nnz)

        # This should find duplicate rows
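        # A sketch mirroring the columns case above (it assumes molids aligns
        # with the rows of X, which is how ecfps_molidsXY is used here):
        # dupe_rows = find_sparse_dupes(X, by_rows=True)
        # for group in dupe_rows:
        #     if len(group) > 1:
        #         print 'Duplicated molecules: %s' % ' '.join(molids[i] for i in group)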