def logreg_molids(dset='lab'):
    """Return the molecule ids of one of the malaria datasets.

    Parameters
    ----------
    dset : string, one of ('lab', 'amb', 'unl', 'scr'), default 'lab'
      Which dataset to take the molids from (labelled, ambiguous,
      unlabelled/held-out or screening).

    Returns
    -------
    The ids as given by the corresponding fingerprint provider.

    Raises
    ------
    ValueError
      If `dset` is not one of the known dataset names.
    """
    # No need to do this on a per-result basis because
    # atm we are warranted that they are the same accross all evaluations.
    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)
    providers = {'lab': rf_lab, 'amb': rf_amb, 'unl': rf_unl, 'scr': rf_scr}
    rf = providers.get(dset)
    if rf is None:
        # FIX: raise the specific ValueError instead of a bare Exception
        # (still caught by any caller catching Exception).
        raise ValueError('Unknown dataset %s' % dset)
    return rf.ids()
def logreg_results_to_pandas(common_molids_cache=False):
    """Collects all the results in disk and places them in record-format
    in a pandas dataframe.

    Allows convenient reporting, grouping and filtering of results.
    """
    results = ResultInDisk.collect_results_under_dir(MALARIA_LOGREGS_EXPERIMENT_ROOT,
                                                     factory=malaria_result_factory)

    # --- molids cache
    # Shared across all results so we avoid the need to reread them...
    molids_cache = None
    if common_molids_cache:
        rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)
        molids_cache = {
            'lab': rf_lab.ids(),
            'amb': rf_amb.ids(),  # to prioritize confirmatory tests on labelled data
            'unl': rf_unl.ids(),
            'scr': rf_scr.ids(),
        }

    records = {}
    for result in results:
        if common_molids_cache:
            result.ids_cache = molids_cache
        # dodgy, rework with a copying constructor
        record = copy(result.info())
        record['result'] = result
        if record['class_weight'] is None:
            record['class_weight'] = 'uniform'
        # Some more ad-hoc keys for the model
        record['num_present_folds'] = result.num_present_folds()
        record['auc_mean'] = result.auc_mean()
        record['enrichement5_mean'] = result.enrichement5_mean()
        # Some more ad-hoc keys for the fingerprint folder
        folder = result.fingerprint_folder()
        record['folder_seed'] = -1 if folder is None else int(folder.seed)
        record['folder_size'] = 0 if folder is None else int(folder.fold_size)
        # Add this result to the data frame
        records[result.root_key()] = record

    return DataFrame(records).T
def trie_bench():
    # Informal benchmark of trie libraries for persisting the substructure vocabulary.
    #
    # Needs to be properly done, but we already know:
    #
    # - In terms of disk space, no option beats vanilla gzipping
    #   on plain text data.
    #
    # - Marisa trie can be useful to avoid memory problems: it can memmap
    #   making multiprocessing easy, it reduces dramatically the
    #   size. Speed is also fastest for this dataset (informal benchmarks
    #   not in here).
    #
    # - The smallest memory footprint is given by plain marisa.
    #   But that is only suitable if we could let marisa decide the mapping
    #   to the column index and if we do not mind these indices changing
    #   each time we update the feature collection. We could always keep
    #   an extra int array with the actual index of the column and update it
    #   when we need to rewrite the trie upon new features arrival.
    #
    # - Most probably, python dict is way faster on remapping - but that is also
    #   probably irrelevant here. We need to measure how much space these consume
    #   and how other alternatives (e.g. sparsepp) fare.
    #
    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)

    def subs(n=None, return_index='simple'):
        # Yields up to n substructure strings, optionally paired with their
        # insertion index (plain int or 1-tuple, as required by each trie API).
        substructures = rf_lab.mfm().substructures()
        if n is None:
            n = len(substructures)
        for i in tqdm(range(min(n, len(substructures)))):
            if return_index == 'simple':
                yield str(substructures[i]), i
            elif return_index == 'tuple':
                yield str(substructures[i]), (i,)
            elif return_index == 'no':
                yield str(substructures[i])
            else:
                raise ValueError('return_index must be one of ["simple", "tuple", "no"], it is %r'
                                 % return_index)

    # Plain marisa. Uncompressed: 11697K. Can mmap.
    # Mapping to index becomes arbitrary, so we either need to define a format
    # with an auxiliary mapping (int by marisa -> int by insertion) and keep it
    # constant with new additions or we just not use this.
    # Need to measure speed.
    # FIX: build and save the trie *before* loading it back; the original code
    # called trie.load() on ~/substructures.marisa before the file had ever
    # been written, failing on any fresh machine.
    trie = marisa_trie.Trie(subs(return_index='no'))
    trie.save(op.expanduser('~/substructures.marisa'))
    trie = marisa_trie.Trie()
    trie.load(op.expanduser('~/substructures.marisa'))
    for k, v in trie.iteritems():
        print(k, v)

    # Marisa with an explicit record payload (the insertion index)
    fmt = 'I'
    trie = marisa_trie.RecordTrie(fmt, subs(return_index='tuple'))
    trie.save(op.expanduser('~/substructures.intMarisa'))

    # DAWG variants
    trie = dawg.DAWG(subs(return_index='simple'))
    trie.save(op.expanduser('~/substructures.dawg'))
    trie = dawg.IntCompletionDAWG(subs(return_index='simple'))
    trie.save(op.expanduser('~/substructures.intCompletionDawg'))
    trie = dawg.IntDAWG(subs(return_index='simple'))
    trie.save(op.expanduser('~/substructures.intDawg'))

    # datrie variants (built incrementally, key alphabet = printable ASCII)
    trie = datrie.Trie(string.printable)
    for s, i in subs(return_index='simple'):
        trie[s] = i
    trie.save(op.expanduser('~/substructures.datrie'))
    trie = datrie.BaseTrie(string.printable)
    for s, i in subs(return_index='simple'):
        trie[s] = i
    trie.save(op.expanduser('~/substructures.basedatrie'))
def logreg_deploy(dest_file=None, with_bug=False):
    """Generates predictions for the competition unlabelled datasets,
    saving them in HDF5 files.

    One prediction is generated per molecule and cross-validation experiment:

    - For the labelled set, the prediction comes from the model of the run
      where the molecule was in the testing set.

    - For the other sets, the predictions are averages of all the models
      built during cross-validation. Note that at the time of submitting
      there was a bug that made these predictions be just the one of the
      last fold (see `with_bug` parameter).

    Parameters
    ----------
    dest_file : string or None, default None
      Path to the HDF5 to store the prediction values.
      There will be as many groups in there as deployed models, and each
      group will contain 4 datasets:
        - lab: predictions on the labelled dataset
        - amb: predictions on the ambiguously labelled compounds
        - unl: predictions on the held-out competition set
        - scr: predictions on the screening dataset

    with_bug : bool, default False
      If True, predictions are generated as for the competition (only the
      last fold of each experiment is taken into account).
      If False, predictions are generated as initially intended (averaging
      all the folds of each experiment).
      This bug does not affect the labelled scores.

    Returns
    -------
    The path to the HDF5 file where the scores have been saved.

    Side effects
    ------------
    The HDF5 file is created
    """
    if dest_file is None:
        dest_file = malaria_logreg_deployers_file(with_bug=with_bug)

    results = logreg_experiments_to_deploy().result
    num_classifiers = sum(len(result.present_folds()) for result in results)
    info('Deploying %d logistic regression experiments (%d classifiers)' %
         (len(results), num_classifiers))

    # We will have a few "features" for each deployer
    # For lab it will just be the test scores
    # For amb, unl and scr it will be the average of the scores for each cv fold
    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)

    with h5py.File(dest_file, 'w') as h5:
        for res in results:
            # Deployer id
            f_name = '%s__%s' % (res.model_setup_id(), res.eval_setup_id())

            # Lab: out-of-fold test scores
            if '%s/lab' % f_name not in h5:
                h5['%s/lab' % f_name] = res.scores()[:, 1].astype(np.float32)

            # Get result models
            models = [res.fold_model(fold, with_bug=with_bug)
                      for fold in res.present_folds()]

            def averaged_scores(rf):
                # Mean over folds of the positive-class probability
                per_fold = [model.predict_proba(rf.X())[:, 1] for model in models]
                return np.nanmean(per_fold, axis=0).astype(np.float32)

            # Amb
            if '%s/amb' % f_name not in h5:
                h5['%s/amb' % f_name] = averaged_scores(rf_amb)
            # Unl
            if '%s/unl' % f_name not in h5:
                h5['%s/unl' % f_name] = averaged_scores(rf_unl)
            # Scr
            if '%s/scr' % f_name not in h5:
                h5['%s/scr' % f_name] = averaged_scores(rf_scr)

    return dest_file
def logreg_deploy(dest_file=MALARIA_LOGREGS_DEPLOYMENT_H5):
    """Generates predictions for unlabelled datasets.

    Selects good cross-validation results by ad-hoc filters, then writes,
    per selected model, four datasets (lab/amb/unl/scr scores) into the
    HDF5 file at `dest_file`.
    """
    df = logreg_results_to_pandas()

    # Choose a few good results (maybe apply diversity filters or ensemble selection or...)
    deployment_cond_1 = ((df.cv_seed < 5) &
                         (df.num_present_folds == df.num_cv_folds) &
                         (df.penalty == 'l1') &
                         (df.C == 1) &
                         (df.class_weight == 'auto') &
                         (df.tol == 1E-4) &
                         (df.folder_size < 1) &
                         (df.folder_seed == -1) &
                         (df.auc_mean > 0.92))
    deployment_cond_2 = ((df.num_present_folds == df.num_cv_folds) &
                         (df.penalty == 'l2') &
                         (df.C == 5) &
                         (df.class_weight == 'auto') &
                         (df.tol == 1E-4) &
                         (df.folder_size < 1) &
                         (df.folder_seed == -1) &
                         (df.auc_mean > 0.93))
    deployers = df[deployment_cond_1 | deployment_cond_2]
    info('Deploying %d logistic regressors' % len(deployers))

    # We will have 40 "features", one for each deployer
    # For lab it will just be the test scores
    # For amb, unl and scr it will be the average of the scores for each cv fold
    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)

    # FIX: use a context manager so the HDF5 file is closed even if a
    # prediction raises (the original left the handle open on error).
    with h5py.File(dest_file, 'w') as h5:
        for i, res in enumerate(deployers.result):
            f_name = '%s__%s' % (res.model_setup_id(), res.eval_setup_id())
            # What about the data setup?
            # Here it works but in general not
            # Save it all... (a new dataset with all the coords and the result path)
            print(f_name)  # FIX: was a Python 2 print statement (syntax error on Python 3)
            # Lab
            if '%s/lab' % f_name not in h5:
                h5['%s/lab' % f_name] = res.scores()[:, 1].astype(np.float32)
            models = [res.fold_model(fold) for fold in res.present_folds()]
            # Amb
            if '%s/amb' % f_name not in h5:
                h5['%s/amb' % f_name] = np.nanmean([model.predict_proba(rf_amb.X())[:, 1]
                                                    for model in models], axis=0).astype(np.float32)
            # Unl
            if '%s/unl' % f_name not in h5:
                h5['%s/unl' % f_name] = np.nanmean([model.predict_proba(rf_unl.X())[:, 1]
                                                    for model in models], axis=0).astype(np.float32)
            # Scr
            if '%s/scr' % f_name not in h5:
                h5['%s/scr' % f_name] = np.nanmean([model.predict_proba(rf_scr.X())[:, 1]
                                                    for model in models], axis=0).astype(np.float32)