def all_subs(dset):
    """Collect the distinct substructures listed in the merged file of a dataset.

    The file "<dset>.merged" under _MALARIA_ECFPS_DIR is read line by line.
    Each line is split on tabs; the first field is skipped and, for each of
    the remaining fields, its first whitespace-separated token is kept.

    Returns a set with all the tokens collected.
    """
    info(dset)
    merged_path = op.join(_MALARIA_ECFPS_DIR, dset + '.merged')
    found = set()
    with open(merged_path) as reader:
        for record in reader:
            fields = record.split('\t')
            for field in fields[1:]:
                found.add(field.split()[0])
    # TODO sort by frequency
    return found
def process(molid, smiles):
    """Write the circular substructures of one molecule as one line of the results file.

    The line is "molid<TAB>feature<TAB>feature...", where each feature is
    "cansmiles count", followed by the "center radius" pairs when write_centers
    is set. If the molecule cannot be processed, "molid<TAB>*FAILED*" is written.

    Passing the _END_MOLID sentinel as molid closes the writer and returns.

    writer, max_radius, fcfp and write_centers come from the enclosing scope.
    """
    if molid is _END_MOLID:
        writer.close()
        return
    try:
        mol = to_rdkit_mol(smiles)
        fpsinfo = {}
        # N.B. We won't actually use rdkit hash, so we won't ask for nonzero values...
        # Is there a way of asking rdkit to give us this directly?
        AllChem.GetMorganFingerprint(mol, max_radius, bitInfo=fpsinfo, useFeatures=fcfp)
        counts = defaultdict(int)
        centers = defaultdict(list)
        for bit_descs in fpsinfo.values():
            for center, radius in bit_descs:
                cansmiles = explain_circular_substructure(mol, center, radius)
                counts[cansmiles] += 1
                centers[cansmiles].append((center, radius))
        if write_centers:
            features_strings = ['%s %d %s' % (cansmiles, count,
                                              ' '.join(['%d %d' % (c, r) for c, r in centers[cansmiles]]))
                                for cansmiles, count in counts.items()]
        else:
            features_strings = ['%s %d' % (cansmiles, count)
                                for cansmiles, count in counts.items()]
        line = '%s\t%s\n' % (molid, '\t'.join(features_strings))
    except Exception:
        # Best effort: a molecule that cannot be processed is recorded as failed.
        # Only Exception is caught so that Ctrl-C / SystemExit still stop the run.
        info('Failed molecule %s: %s' % (molid, smiles))
        line = '%s\t*FAILED*\n' % molid
    # Written outside the try so that an I/O error is not mistaken for a bad molecule
    writer.write(line)
def _molidsmiles_it_ecfp(output_file, start=0, step=46, fcfp=True, logeach=5000):
    """Q&D variant to allow Parallel work (cannot pickle closures or reuse iterators...).

    Builds a processor with _ecfp_writer, feeds it every step-th (molid, smiles)
    pair from read_smiles_ultraiterator() beginning at index start, and finally
    sends it (_END_MOLID, None). Progress is logged each logeach molecules
    (never if logeach <= 0).
    """
    process_mol = _ecfp_writer(output_file=output_file, fcfp=fcfp)
    selected = islice(read_smiles_ultraiterator(), start, None, step)
    for position, pair in enumerate(selected):
        molid, smiles = pair
        must_log = logeach > 0 and position > 0 and position % logeach == 0
        if must_log:
            info('Molecule %d' % position)
        process_mol(molid, smiles)
    process_mol(_END_MOLID, None)
def save_molids(data_dir, name, molids, overwrite=False):
    """Write the molids, one per line, to the text file "<name>.ids" in data_dir.

    Nothing is done if the file already exists, unless overwrite is set.
    """
    target = op.join(data_dir, '%s.ids' % name)
    if op.isfile(target) and not overwrite:
        return
    info('Saving molids...')
    with open(target, 'w') as writer:
        for molid in molids:
            writer.write(molid)
            writer.write('\n')
def malaria_ecfp_parallel_results_iterator(prefix='', log=True):
    """Yield, one by one, the lines of the parallel ecfp computation result files.

    The files are the gzipped "<prefix>*.weirdfps" under
    _MALARIA_ECFPS_PARALLEL_RESULTS_DIR, visited in the order given by
    _sort_by_start. If log is set, each file name is logged before reading it.
    """
    pattern = op.join(_MALARIA_ECFPS_PARALLEL_RESULTS_DIR, '%s*.weirdfps' % prefix)
    for path in _sort_by_start(glob(pattern)):
        if log:
            info(path)
        with gzip.open(path) as reader:
            for line in reader:
                yield line
def do_trees_submissions(do_confirmatory=True, do_heldout=True, do_screening=True):
    """Compute the submissions for the trees experiments.

    Delegates to compute_submissions using the trees deployers, molids and
    labels providers, saving under MALARIA_TREES_EXPERIMENT_ROOT.
    The three flags select which submissions are computed.
    """
    switches = dict(do_confirmatory=do_confirmatory,
                    do_heldout=do_heldout,
                    do_screening=do_screening)
    compute_submissions(prefix='trees',
                        dest_dir=MALARIA_TREES_EXPERIMENT_ROOT,
                        deployers=trees_deployers,
                        molids_provider=trees_molids,
                        y_provider=trees_y,
                        **switches)
    info('Submissions computed!')
def catalog_malaria_mols(overwrite=False, checks=False):
    """Bootstrap the malaria catalogs.

    One catalog is built for each of the unlabelled ('unl'), labelled ('lab')
    and screening ('scr') smiles readers, under MALARIA_DATA_ROOT/rdkit/mols.
    overwrite and checks are forwarded to build_benchmark_check_rdkmols_catalog.
    """
    mols_root = op.join(MALARIA_DATA_ROOT, 'rdkit', 'mols')
    readers_by_dset = (
        ('unl', read_unlabelled_smiles),
        ('lab', read_labelled_only_smiles),
        ('scr', read_screening_smiles),
    )
    destinations = tuple((op.join(mols_root, dset), reader) for dset, reader in readers_by_dset)
    for catalog_dir, reader in destinations:
        build_benchmark_check_rdkmols_catalog(catalog_dir,
                                              molit=reader,
                                              checks=checks,
                                              overwrite=overwrite)
    info('ALL DONE')
def compute_submissions(prefix, dest_dir, deployers, molids_provider, y_provider,
                        do_confirmatory=True, do_heldout=True, do_screening=True,
                        confirmatory_top=500, scr_top=1000):
    """Compute the confirmatory, held-out and screening submissions for a family of models.

    Parameters:
      - prefix: name used in log messages and as prefix of the output file names
      - dest_dir: directory where the submission files are written
      - deployers, molids_provider, y_provider: forwarded to compute_confirmatory
        and compute_heldout
      - do_confirmatory: compute the hit selection file ("<prefix>_hitSelection.txt")
      - do_heldout: compute predictions for the 'unl' dataset
      - do_screening: compute predictions for the 'scr' dataset
      - confirmatory_top, scr_top: the select_top passed for the confirmatory
        and screening submissions, respectively

    For 'unl' and 'scr' two files are generated, one with averaged scores and one
    stacking with a linear regression; the Kendall-tau between both scorings is logged.
    """
    info('Computing submissions for %s' % prefix)

    mc = MalariaCatalog()  # For performance, maybe this should be singleton...

    if do_confirmatory:
        # Share the catalog instead of letting compute_confirmatory build another one
        compute_confirmatory(deployers,
                             molids_provider,
                             outfile=op.join(dest_dir, '%s_hitSelection.txt' % prefix),
                             y_provider=y_provider,
                             select_top=confirmatory_top,
                             mc=mc)

    def do_predict(dset, select_top=None):
        """Compute the averaged and the linear-regression stacked predictions for dset."""
        info('Computing predictions for %s: %s' % (prefix, dset))
        _, scores_averaged = compute_heldout(dset,
                                             deployers,
                                             molids_provider,
                                             op.join(dest_dir, '%s_%s-averaged.txt' % (prefix, dset)),
                                             y_provider=y_provider,
                                             mc=mc,
                                             select_top=select_top)
        _, scores_linr = compute_heldout(dset,
                                         deployers,
                                         molids_provider,
                                         op.join(dest_dir, '%s_%s-stacker=linr.txt' % (prefix, dset)),
                                         y_provider=y_provider,
                                         stacker=LinearRegression(),
                                         mc=mc,
                                         select_top=select_top)
        info('Computing kendall-tau (go take a nap if there are a lot of examples...)')
        tau = kendalltau(scores_linr, scores_averaged)
        if isinstance(tau, tuple):
            # scipy's kendalltau returns (tau, p-value); "%.2f" needs just the statistic
            tau = tau[0]
        info('%s:%s - Kendall-tau avg vs linr: %.2f' % (prefix, dset, tau))

    if do_heldout:
        do_predict('unl')
    if do_screening:
        do_predict('scr', select_top=scr_top)
def compute_confirmatory(deployers, molids_provider, outfile, y_provider=None, select_top=500, mc=None):
    """Score and rank the labelled and ambiguous molecules by plain-averaging the deployers scores.

    Parameters:
      - deployers: callable; deployers(dset=...) returns a pair whose first element
        is the scores matrix of the dataset (averaged here over axis 1, ignoring NaNs)
      - molids_provider: callable; molids_provider(dset=...) returns the molids of the dataset
      - outfile: path of the text file where "molid,smiles,score" lines are written
      - y_provider: callable returning the labels used to report the AUC on 'lab'
        (it is always called, so it cannot be left as None)
      - select_top: forwarded to rank_sort
      - mc: a MalariaCatalog; a new one is created if None

    Returns the pair (molids, scores) for all the molecules, 'lab' first and then 'amb'.

    Side effects: writes outfile and pickles a pandas Series (scores indexed by molid)
    in the same path with the extension replaced by ".pkl".
    """
    # Labelled: report how well the plain average does
    Xlab, _ = deployers(dset='lab')
    y_true = y_provider()
    lab_average = np.nanmean(Xlab, axis=1)
    info('AUC after plain averaging (bagging like): %.3f' % roc_auc_score(y_true, lab_average))

    # Ambiguous
    Xamb, _ = deployers(dset='amb')

    # All together; scores are just plain averages
    scores = np.nanmean(np.vstack((Xlab, Xamb)), axis=1)

    # Get the molids, smiles, labels, pec50
    molids = np.hstack((molids_provider(dset='lab'), molids_provider(dset='amb')))
    catalog = MalariaCatalog() if mc is None else mc
    labels = catalog.molids2labels(molids)
    pec50s = catalog.molids2pec50s(molids)
    smiles = catalog.molids2smiless(molids)

    # Rankings
    # N.B. if analyzing ranking variability, use instead scores2rankings()
    _, sorted_columns = rank_sort(scores,
                                  (scores, molids, labels, pec50s, smiles),
                                  reverse=True,
                                  select_top=select_top)
    sscores, smolids, _, _, ssmiles = sorted_columns

    # Save for submission
    with open(outfile, 'w') as writer:
        for row in zip(smolids, ssmiles, sscores):
            writer.write('%s,%s,%.6f\n' % row)

    # Create and save a pandas series to allow further stacking
    pickle_path = op.splitext(outfile)[0] + '.pkl'
    Series(data=scores, index=molids).to_pickle(pickle_path)

    return molids, scores
def process(molid, smiles):
    """Append one molecule (its id and its descriptors) to the molids and descs datasets.

    A new row is added to both datasets. The descriptors are the first element
    of computer.compute(mol); if the molecule cannot be parsed or its
    descriptors cannot be computed, the failure is logged and the row is
    filled with NaNs.

    Passing the _END_MOLID sentinel as molid closes h5 and returns.

    h5, molids, descs, nf and computer come from the enclosing scope
    (presumably molids and descs are resizable datasets stored in h5).
    """
    if molid is _END_MOLID:
        h5.close()
        return
    ne = len(molids)
    # Grow both datasets before anything can fail: the failure handler writes to
    # descs[ne], which must already exist even if the molecule cannot be parsed
    molids.resize((ne + 1,))
    descs.resize((ne + 1, nf))
    molids[ne] = molid
    try:
        mol = to_rdkit_mol(smiles)
        descs[ne, :] = computer.compute(mol)[0]
    except Exception:
        # Best effort: keep the molecule, with missing descriptors.
        # Only Exception is caught so that Ctrl-C / SystemExit still stop the run.
        info('Failed molecule %s: %s' % (molid, smiles))
        descs[ne, :] = [np.nan] * nf
def _molidsmiles_it(start=0, step=46, mols=None, processor=None, logeach=500):
    """Feed the processor with one out of each step (molid, smiles) pairs.

    This is useful for evenly splitting workloads between processors / machines.

    Parameters:
      - start: the index of the first pair to consider
      - step: only one pair out of each step is considered, the rest are skipped
      - mols: an iterator of (molid, smiles) pairs;
              if None, read_smiles_ultraiterator() is used
      - processor: a function that gets called with each selected pair;
                   when the iterator is exhausted, (_END_MOLID, None) is sent
      - logeach: progress is logged each logeach processed molecules
                 (never if it is not positive)
    """
    source = read_smiles_ultraiterator() if mols is None else mols
    for position, pair in enumerate(islice(source, start, None, step)):
        molid, smiles = pair
        must_log = logeach > 0 and position > 0 and position % logeach == 0
        if must_log:
            info('Molecule %d' % position)
        processor(molid, smiles)
    processor(_END_MOLID, None)
def detect_duplicate_features(transductive=False, verbose=False):
    """Group the features (columns) of the malaria fingerprints that appear in the same molecules.

    Two features are put together when their columns have non-zeros in exactly
    the same rows, that is, duplicated is very practically defined as
    "appearing in the same molecules across the malaria dataset".

    Parameters:
      - transductive: if True, the labelled, unlabelled and screening matrices are
        stacked (in that order) before grouping; otherwise only the labelled matrix is used
      - verbose: if True, progress is logged each million features

    Returns the groups, each an array('I') with the column indices of the features
    in the group (features without duplicates are in groups of size one).
    """
    # TODO: this is really memory intensive, make streaming (over the columns...)
    # TODO: manage ambiguous...
    # Are there many singleton features collapsed?
    if transductive:
        setups = (('lab', False), ('unl', True), ('scr', True))
        X = vstack(tuple(MalariaFingerprintsManager(dset=dset, keep_ambiguous=keep).X()
                         for dset, keep in setups))
    else:
        X = MalariaFingerprintsManager(dset='lab', keep_ambiguous=False).X()
    info('MatrixMol Feature Duplicate detection')
    info('We are dealing with a matrix as big as %d molecules and %d features' % X.shape)
    num_features = X.shape[1]
    X = X.tocsc()
    X.indices.flags.writeable = False  # Make the views from this array hashable

    def new_group():
        return array('I')

    groups = defaultdict(new_group)
    column_starts = X.indptr
    for column in xrange(num_features):
        # The row indices of the column, used (through their buffer) as the group key
        rows = X.indices[column_starts[column]:column_starts[column + 1]]
        groups[rows.data].append(column)
        if verbose and column > 0 and column % 1000000 == 0:
            info('%d of %d substructures hashed according to the molecules they pertain' %
                 (column, num_features))
    return groups.values()
def trees_deploy(dest_file=MALARIA_TREES_DEPLOYMENT_H5):
    """Store in an HDF5 file the scores of the selected tree ensembles for all the datasets.

    The ensembles deployed are the results with 6000 trees. For each of them a
    group named "<model_setup_id>__<eval_setup_id>" is created, with datasets
    'lab', 'amb', 'unl' and 'scr' holding column 1 of the result scores for
    that dataset, as float32. The screening scores are passed through
    fix_streaming_scoring_bug_results and checked to have 5488144 rows.

    Parameters:
      - dest_file: path to the HDF5 file; it is overwritten if it exists
    """
    df = trees_results_to_pandas()

    # Choose a few good results (maybe apply diversity filters or ensemble selection or...)
    deployers = df[(df.model_num_trees == 6000)]

    info('Deploying %d tree ensembles' % len(deployers))

    # The context manager closes the file also when something fails midway
    with h5py.File(dest_file, 'w') as h5:
        for res in deployers.result:
            # What about the data setup? Here it works but in general not.
            # Save it all... (a new dataset with all the coords and the result path)
            f_name = '%s__%s' % (res.model_setup_id(), res.eval_setup_id())
            info(f_name)

            # Lab, Amb and Unl are stored the same way
            for dset in ('lab', 'amb', 'unl'):
                key = '%s/%s' % (f_name, dset)
                if key not in h5:
                    h5[key] = res.scores(dset)[:, 1].astype(np.float32)

            # Scr
            scr_key = '%s/scr' % f_name
            if scr_key not in h5:
                h5[scr_key] = fix_streaming_scoring_bug_results(res.scores('scr')[:, 1].astype(np.float32))
                assert h5[scr_key].shape[0] == 5488144, 'Streaming rdkf bug striking back...'
def predict_malaria_unlabelled(model, h5, rf_amb=None, rf_scr=None, rf_unl=None, chunksize=0):
    """Use the model to cast predictions for the datasets, storing them in the h5 file.

    Parameters:
      - model: a fitted classifier with a predict_proba method
      - h5: where predictions are stored, under the keys 'amb', 'unl' and 'scr'
      - rf_amb, rf_unl, rf_scr: providers of the ambiguous, unlabelled (competition)
        and screening examples, via their X() method; a dataset is skipped when
        its provider is None
      - chunksize: if positive, the screening dataset is predicted on streams of
        chunksize examples (using ne_stream() and X_stream() from rf_scr) instead
        of all at once

    All the predictions are stored as float32.
    """
    # Ambiguous examples
    if rf_amb is not None:
        info('Scoring amb...')
        h5['amb'] = model.predict_proba(rf_amb.X()).astype(np.float32)
    # Unlabelled (competition) examples
    if rf_unl is not None:
        info('Scoring unl...')
        h5['unl'] = model.predict_proba(rf_unl.X()).astype(np.float32)
    # Screening examples
    if rf_scr is not None:
        info('Scoring scr...')
        if chunksize <= 0:
            # float32, like everywhere else: an integer type would truncate the probabilities
            h5['scr'] = model.predict_proba(rf_scr.X()).astype(np.float32)
        else:
            scr = h5.create_dataset('scr', shape=(rf_scr.ne_stream(), 2), dtype=np.float32)
            for i, x in enumerate(rf_scr.X_stream(chunksize=chunksize)):
                base = i * chunksize
                info('\t num_scr_examples: %d' % base)
                scr[base:base + chunksize] = model.predict_proba(x)
def do_predict(dset, select_top=None):
    """Compute the averaged and the linear-regression stacked predictions for dset.

    Two submission files are generated in dest_dir via compute_heldout:
    "<prefix>_<dset>-averaged.txt" and "<prefix>_<dset>-stacker=linr.txt".
    The Kendall-tau between both scorings is logged.

    prefix, dest_dir, deployers, molids_provider, y_provider and mc come
    from the enclosing scope; select_top is forwarded to compute_heldout.
    """
    info('Computing predictions for %s: %s' % (prefix, dset))
    _, scores_averaged = compute_heldout(dset,
                                         deployers,
                                         molids_provider,
                                         op.join(dest_dir, '%s_%s-averaged.txt' % (prefix, dset)),
                                         y_provider=y_provider,
                                         mc=mc,
                                         select_top=select_top)
    _, scores_linr = compute_heldout(dset,
                                     deployers,
                                     molids_provider,
                                     op.join(dest_dir, '%s_%s-stacker=linr.txt' % (prefix, dset)),
                                     y_provider=y_provider,
                                     stacker=LinearRegression(),
                                     mc=mc,
                                     select_top=select_top)
    info('Computing kendall-tau (go take a nap if there are a lot of examples...)')
    tau = kendalltau(scores_linr, scores_averaged)
    if isinstance(tau, tuple):
        # scipy's kendalltau returns (tau, p-value); "%.2f" needs just the statistic
        tau = tau[0]
    info('%s:%s - Kendall-tau avg vs linr: %.2f' % (prefix, dset, tau))
def logreg_deploy(dest_file=MALARIA_LOGREGS_DEPLOYMENT_H5):
    """Generate predictions for all the datasets with selected logistic regressions.

    For each selected result, a group named "<model_setup_id>__<eval_setup_id>"
    is created in the HDF5 file with these float32 datasets:
      - lab: column 1 of the result scores
      - amb, unl, scr: the average (ignoring NaNs) over the models of the present
        folds of column 1 of their predicted probabilities

    Parameters:
      - dest_file: path to the HDF5 file; it is overwritten if it exists
    """
    df = logreg_results_to_pandas()

    # Choose a few good results (maybe apply diversity filters or ensemble selection or...)
    deployment_cond_1 = ((df.cv_seed < 5) &
                         (df.num_present_folds == df.num_cv_folds) &
                         (df.penalty == 'l1') &
                         (df.C == 1) &
                         (df.class_weight == 'auto') &
                         (df.tol == 1E-4) &
                         (df.folder_size < 1) &
                         (df.folder_seed == -1) &
                         (df.auc_mean > 0.92))

    deployment_cond_2 = ((df.num_present_folds == df.num_cv_folds) &
                         (df.penalty == 'l2') &
                         (df.C == 5) &
                         (df.class_weight == 'auto') &
                         (df.tol == 1E-4) &
                         (df.folder_size < 1) &
                         (df.folder_seed == -1) &
                         (df.auc_mean > 0.93))

    deployers = df[deployment_cond_1 | deployment_cond_2]

    info('Deploying %d logistic regressors' % len(deployers))

    # We will have 40 "features", one for each deployer
    # For lab it will just be the test scores
    # For amb, unl and scr it will be the average of the scores for each cv fold

    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)

    # The context manager closes the file also when something fails midway
    with h5py.File(dest_file, 'w') as h5:
        for res in deployers.result:
            # What about the data setup? Here it works but in general not.
            # Save it all... (a new dataset with all the coords and the result path)
            f_name = '%s__%s' % (res.model_setup_id(), res.eval_setup_id())
            info(f_name)

            # Lab
            lab_key = '%s/lab' % f_name
            if lab_key not in h5:
                h5[lab_key] = res.scores()[:, 1].astype(np.float32)

            # Amb, Unl and Scr: average the scores of the models of each fold
            models = [res.fold_model(fold) for fold in res.present_folds()]
            for dset, provider in (('amb', rf_amb), ('unl', rf_unl), ('scr', rf_scr)):
                key = '%s/%s' % (f_name, dset)
                if key not in h5:
                    X = provider.X()  # Fetched once per dataset, not once per fold model
                    h5[key] = np.nanmean([model.predict_proba(X)[:, 1] for model in models],
                                         axis=0).astype(np.float32)
def build_benchmark_check_rdkmols_catalog(mmapdir, molit=read_labelled_only_smiles, checks=False, overwrite=False):
    """Build a memmapped molecules catalog from a (molid, smiles) iterator, optionally testing it.

    Parameters:
      - mmapdir: the directory of the MemMappedMols catalog
      - molit: a function returning an iterator of (molid, smiles) pairs
      - checks: if True, after building, reading from the catalog is benchmarked
        (in stored order, in set order and against parsing the smiles again) and
        each stored molecule is compared with the one parsed from its smiles,
        warning about the differences
      - overwrite: if False and the catalog already exists, it is not rebuilt
    """
    # Build the catalog
    info('Building %s catalog...' % mmapdir)
    build_start = time()
    catalog = MemMappedMols(mmapdir)
    if overwrite or not catalog.has_catalog():
        catalog.save_from_smiles_iterator(molit())
        info('Time taken to build the memmapped file: %.2f seconds' % (time() - build_start))
    else:
        info('Already computed, skipping.')

    if not checks:
        return

    # Load the catalog
    reloaded = MemMappedMols(mmapdir)

    # Lame benchmark - memmapped contiguous
    info('Benchmarking contiguous memmap reading')
    tic = time()
    num_read = 0
    # noinspection PyTypeChecker
    for molid in reloaded.molids():
        reloaded.mol(molid)
        num_read += 1
    info('Time taken to read the memmapped %d mols (contiguous): %.2f seconds' % (num_read, time() - tic))

    # Lame benchmark - memmapped, in the iteration order of a set of the molids
    info('Benchmarking random memmap reading')
    tic = time()
    num_read = 0
    for molid in set(reloaded.molids()):
        reloaded.mol(molid)
        num_read += 1
    info('Time taken to read the memmapped %d mols (random): %.2f seconds' % (num_read, time() - tic))

    # Lame benchmark - from smiles
    info('Benchmarking reading from the original file')
    tic = time()
    num_read = 0
    for _, smiles in molit():
        Chem.MolFromSmiles(smiles)
        num_read += 1
    info('Time taken to read the smiled %d mols: %.2f seconds' % (num_read, time() - tic))

    # Exhaustive linear test that all mols are correctly stored
    info('Making sure that all is OKish')
    for molid, smiles in molit():
        expected = Chem.MolFromSmiles(smiles)
        stored = reloaded.mol(molid)
        if expected is None:
            # Unparseable smiles should have no molecule in the store
            if stored is not None:
                warning('Molecule %s with original smiles %s should not be parsed from the binary store' %
                        (molid, smiles))
            continue
        if stored is None:
            continue
        expected_smiles = Chem.MolToSmiles(expected)
        stored_smiles = Chem.MolToSmiles(stored)
        if expected_smiles != stored_smiles:
            warning('Molecule %s with original smiles %s do not reconstruct properly: \n\t(%s != %s)' %
                    (molid, smiles, expected_smiles, stored_smiles))
    info('All is OKish')
def clean_results_pre_infojson_bug_fix():
    """Log the eval directory of each logreg result that lacks its "info.json" file.

    The results are collected under MALARIA_LOGREGS_EXPERIMENT_ROOT.
    Note that nothing is removed here: the directories are only logged.
    """
    results = ResultInDisk.collect_results_under_dir(MALARIA_LOGREGS_EXPERIMENT_ROOT,
                                                     factory=malaria_result_factory)

    def lacks_info(res):
        return not op.isfile(op.join(res.eval_dir, 'info.json'))

    incomplete = [res for res in results if lacks_info(res)]
    for res in incomplete:
        info('Bye %s' % res.eval_dir)
def logreg_deploy(dest_file=None, with_bug=False):
    """
    Generates predictions for the competition datasets, saving them in an HDF5 file.

    One prediction is generated per molecule and cross-validation experiment:
      - For the labelled set, the prediction is given by the model of the run
        where the molecule was in the testing set.
      - For the other sets, the predictions are averages of all the models
        built during cross-validation.
        Note that at the time of submitting there was a bug that made these
        predictions be just the one of the last fold (see `with_bug` parameter).

    Parameters
    ----------
    dest_file : string or None, default None
      Path to the HDF5 to store the prediction values.
      If None, the path given by `malaria_logreg_deployers_file` is used.
      There will be as many groups in there as deployed experiments.
      Each group will contain 4 datasets:
        - lab: predictions on the labelled dataset
        - amb: predictions on the ambiguously labelled compounds
        - unl: predictions in the held-out competition set
        - scr: predictions in the screening dataset

    with_bug : bool, default False
      If True, predictions will be generated as for the competition
      (taking only the last fold of each experiment into account).
      If False, predictions will be generated as initially intended
      (averaging all the folds for each experiment).
      This bug does not affect the labelled scores.

    Returns
    -------
    The path to the HDF5 file where the scores have been saved.

    Side effects
    ------------
    The HDF5 file is created (overwriting any previous file in the same path)
    """
    if dest_file is None:
        dest_file = malaria_logreg_deployers_file(with_bug=with_bug)

    results = logreg_experiments_to_deploy().result

    num_classifiers = sum(len(result.present_folds()) for result in results)
    info('Deploying %d logistic regression experiments (%d classifiers)' % (
        len(results), num_classifiers))

    # We will have a few "features" for each deployer
    # For lab it will just be the test scores
    # For amb, unl and scr it will be the average of the scores for each cv fold

    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)
    unlabelled_providers = (('amb', rf_amb), ('unl', rf_unl), ('scr', rf_scr))

    with h5py.File(dest_file, 'w') as h5:

        for res in results:

            # Deployer id
            f_name = '%s__%s' % (res.model_setup_id(), res.eval_setup_id())

            # Lab
            lab_key = '%s/lab' % f_name
            if lab_key not in h5:
                h5[lab_key] = res.scores()[:, 1].astype(np.float32)

            # Get result models
            models = [res.fold_model(fold, with_bug=with_bug) for fold in res.present_folds()]

            # Amb, Unl and Scr
            for dset, provider in unlabelled_providers:
                key = '%s/%s' % (f_name, dset)
                if key in h5:
                    continue
                fold_scores = [model.predict_proba(provider.X())[:, 1] for model in models]
                h5[key] = np.nanmean(fold_scores, axis=0).astype(np.float32)

    return dest_file
def fit_logregs(dest_dir=MALARIA_LOGREGS_EXPERIMENT_ROOT,
                # Logreg params
                logreg_penalty='l1',
                logreg_C=1.0,
                logreg_class_weight_auto=False,
                logreg_dual=False,
                logreg_tol=1e-4,
                logreg_fit_intercept=True,
                logreg_intercept_scaling=1,
                # CV params
                num_cv_folds=10,
                cv_seeds=(0,),
                save_unlabelled_predictions=False,
                save_fold_model=False,
                min_fold_auc=0.88,
                # Fingerprint folding params
                fingerprint_folder_seed=0,
                fingerprint_fold_size=1023,
                # Computational requirements params
                force=False,
                chunksize=1000000):
    """Logistic regression experiment using the liblinear wrapper in sklearn.
    Generates cross-val results.

    Parameters:
      - dest_dir: root directory for the results; they are saved under
        dest_dir/<data id>/<model id>/<eval id>
      - logreg_*: the parameters of the logistic regression
        (class_weight is 'auto' if logreg_class_weight_auto else None)
      - num_cv_folds: number of cross-validation folds
      - cv_seeds: one cross-validation experiment is run per seed; the seed also
        determines the random_state of the model
      - save_unlabelled_predictions: also predict the ambiguous, unlabelled and
        screening datasets with the model of each fold
      - save_fold_model: pickle the fitted model of each fold
      - min_fold_auc: if a fold gets a lower AUC, the rest of the folds of the
        experiment are not computed (a STOP_BAD_FOLD file is left in the eval dir)
      - fingerprint_folder_seed, fingerprint_fold_size: fingerprint folding setup;
        no folding is used if fingerprint_fold_size < 1
      - force: recompute experiments for which the "info.json" file already exists
      - chunksize: forwarded to predict_malaria_unlabelled

    Numeric parameters can be given as strings (e.g. from the command line).
    """
    # Command line type inference is rotten...
    # The tolerance is converted before it is compared in the guard below
    logreg_tol = float(logreg_tol)

    ### TODO Remove
    if logreg_tol < 1E-5:
        info('Ignoring long intolerant experiments')
        return

    info('Malaria logregs experiment')

    # Command line type inference is rotten...
    logreg_C = float(logreg_C)
    logreg_intercept_scaling = float(logreg_intercept_scaling)
    num_cv_folds = int(num_cv_folds)
    min_fold_auc = float(min_fold_auc)
    fingerprint_folder_seed = int(fingerprint_folder_seed)
    fingerprint_fold_size = int(fingerprint_fold_size)
    chunksize = int(chunksize)

    # Example providers
    folder = None if fingerprint_fold_size < 1 else MurmurFolder(seed=fingerprint_folder_seed,
                                                                 fold_size=fingerprint_fold_size)
    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(folder)
    info('Data description: %s' % rf_lab.configuration().id(full=True))

    # Experiment context: data
    data_id = rf_lab.configuration().id(full=True)
    data_dir = op.join(dest_dir, data_id)
    ensure_dir(data_dir)

    for cv_seed in cv_seeds:

        # Command line type inference is rotten...
        cv_seed = int(cv_seed)

        # Deterministic randomness
        my_rng = np.random.RandomState(seed=cv_seed)

        # Experiment context: model
        logreg_params = OrderedDict((
            ('penalty', logreg_penalty),
            ('C', logreg_C),
            ('class_weight', 'auto' if logreg_class_weight_auto else None),
            ('dual', logreg_dual),
            ('tol', logreg_tol),
            ('fit_intercept', logreg_fit_intercept),
            ('intercept_scaling', logreg_intercept_scaling),
            ('random_state', my_rng.randint(low=0, high=1000 ** 4)),
        ))
        model_setup = LogisticRegression(**logreg_params)
        model_id = 'skllogreg__%s' % '__'.join(['%s=%s' % (k, str(v)) for k, v in logreg_params.items()])
        model_dir = op.join(data_dir, model_id)
        ensure_dir(model_dir)
        info('Model: %s' % model_id)

        # Experiment context: eval
        eval_id = 'cv__cv_seed=%d__num_folds=%d' % (cv_seed, num_cv_folds)
        eval_dir = op.join(model_dir, eval_id)
        ensure_dir(eval_dir)
        info('Eval: %d-fold cross validation (seed=%d)' % (num_cv_folds, cv_seed))

        # Already done?
        info_file = op.join(eval_dir, 'info.json')
        if op.isfile(info_file) and not force:
            info('\tAlready done, skipping...')
            # Oh well, a lot have been done up to here... rework somehow
            # Skip only this seed: the experiments for the remaining seeds may be pending
            continue

        # Anytime we see this file, we know we need to stop
        stop_computing_file = op.join(eval_dir, 'STOP_BAD_FOLD')

        #---------
        #--------- Time to work!
        #---------

        # Save model config
        joblib.dump(model_setup, op.join(model_dir, 'model_setup.pkl'), compress=3)

        # Read labelled data in
        info('Reading data...')
        X, y = rf_lab.Xy()
        info('ne=%d; nf=%d' % rf_lab.X().shape)

        # Save molids... a bit too ad-hoc...
        save_molids(data_dir, 'lab', rf_lab.ids())
        if save_unlabelled_predictions:
            save_molids(data_dir, 'unl', rf_unl.ids())
            save_molids(data_dir, 'scr', rf_scr.ids())
            save_molids(data_dir, 'amb', rf_amb.ids())

        # Save folding information.
        # By now, all the folds have already been computed:
        #   - because we cached X
        #   - and in this case we are warranted that no new unfolded features will appear at test time
        if folder is not None:
            info('Saving the map folded_features -> unfolded_feature...')
            folded2unfolded_file = op.join(data_dir, 'folded2unfolded.h5')
            if not op.isfile(folded2unfolded_file):
                # Explicit write mode: the default mode of h5py.File depends on the h5py version
                with h5py.File(folded2unfolded_file, 'w') as h5:
                    h5['f2u'] = folder.folded2unfolded()
            folder_light_file = op.join(data_dir, 'folder.pkl')
            if not op.isfile(folder_light_file):
                folder_light = copy(folder)  # Shallow copy
                folder_light.clear_cache()
                joblib.dump(folder_light, folder_light_file, compress=3)

        # Cross-val splitter
        cver = cv_splits(num_points=len(y),
                         Y=y,
                         num_folds=num_cv_folds,
                         rng=my_rng,
                         stratify=True)

        # Fit and classify
        for cv_fold_num in range(num_cv_folds):

            fold_info_file = op.join(eval_dir, 'fold=%d__info.json' % cv_fold_num)
            if op.isfile(fold_info_file):
                info('Fold %d already done, skipping' % cv_fold_num)
                continue

            if op.isfile(stop_computing_file):
                info('Bad fold detected, no more computations required')
                break

            # Split into train/test
            train_i, test_i = cver(cv_fold_num)
            Xtrain, ytrain = X[train_i, :], y[train_i]
            Xtest, ytest = X[test_i, :], y[test_i]

            # Copy the model...
            model = clone(model_setup)

            start = time()
            info('Training...')
            model.fit(Xtrain, ytrain)
            train_time = time() - start
            info('Model fitting has taken %.2f seconds' % train_time)

            if save_fold_model:
                info('Saving trained model')
                joblib.dump(model, op.join(eval_dir, 'fold=%d__fitmodel.pkl' % cv_fold_num), compress=3)

            info('Predicting and saving results...')
            with h5py.File(op.join(eval_dir, 'fold=%d__scores.h5' % cv_fold_num), 'w') as h5:

                start = time()

                # Test indices
                h5['test_indices'] = test_i

                # Model
                h5['logreg_coef'] = model.coef_
                h5['logreg_intercept'] = model.intercept_

                # Test examples
                info('Scoring test...')
                scores_test = model.predict_proba(Xtest)
                fold_auc = roc_auc_score(ytest, scores_test[:, 1])
                fold_enrichment5 = enrichment_at(ytest, scores_test[:, 1], percentage=0.05)
                info('Fold %d ROCAUC: %.3f' % (cv_fold_num, fold_auc))
                info('Fold %d Enrichment at 5%%: %.3f' % (cv_fold_num, fold_enrichment5))
                h5['test'] = scores_test.astype(np.float32)

                if save_unlabelled_predictions:
                    predict_malaria_unlabelled(model,
                                               h5,
                                               rf_amb=rf_amb,
                                               rf_scr=rf_scr,
                                               rf_unl=rf_unl,
                                               chunksize=chunksize)

                test_time = time() - start
                info('Predicting has taken %.2f seconds' % test_time)

            # Finally save meta-information for the fold
            # N.B. the title is kept as it has always been written to the results
            metainfo = mlexp_info_helper(
                title='malaria-trees-oob',
                data_setup=data_id,
                model_setup=model_id,
                exp_function=giveupthefunc(),
            )
            metainfo.update((
                ('train_time', train_time),
                ('test_time', test_time),
                ('auc', fold_auc),
                ('enrichment5', fold_enrichment5),
            ))
            with open(fold_info_file, 'w') as writer:
                json.dump(metainfo, writer, indent=2, sort_keys=False)

            # One last thing, should we stop now?
            if fold_auc < min_fold_auc:
                stop_message = 'The fold %d was bad (auc %.3f < %.3f), skipping the rest of the folds' % \
                               (cv_fold_num, fold_auc, min_fold_auc)
                info(stop_message)
                with open(stop_computing_file, 'w') as writer:
                    writer.write(stop_message)

        # Summarize cross-val in the info file
        metainfo = mlexp_info_helper(
            title='malaria-trees-oob',
            data_setup=data_id,
            model_setup=model_id,
            exp_function=giveupthefunc(),
        )
        metainfo.update((
            ('num_cv_folds', num_cv_folds),
            ('cv_seed', cv_seed),
        ))
        metainfo.update(logreg_params.items())
        with open(info_file, 'w') as writer:
            json.dump(metainfo, writer, indent=2, sort_keys=False)
def fit(dest_dir=MALARIA_TREES_EXPERIMENT_ROOT,
        seeds=(0, 1, 2, 3, 4),
        num_treess=(10, 6000, 4000, 2000, 1000, 500, 20, 50, 100),
        save_trained_models=False,
        chunksize=200000,
        num_threads=None,
        force=False):
    """Tree ensembles experiment on the rdkit descriptors, generating OOB results.

    An ExtraTreesClassifier and a RandomForestClassifier are trained on the
    labelled data for each combination of seed and number of trees. For each
    model, the out-of-bag scores on the labelled data, the feature importances
    and the predictions for the ambiguous, unlabelled and screening datasets are
    saved in an HDF5 file, together with an "info.json" with meta-information.

    Parameters:
      - dest_dir: root directory for the results; they are saved under
        dest_dir/<data id>/<model id>/oob
      - seeds: the random seeds for the models
      - num_treess: the numbers of trees for the ensembles
      - save_trained_models: pickle also the trained models
      - chunksize: if positive, the screening dataset is predicted on streams of
        chunksize examples instead of all at once
      - num_threads: number of threads used by the models; if None, cpu_count() is used
      - force: recompute experiments for which the "info.json" file already exists
    """
    # Generates OOB results

    info('Malaria trees experiment')

    # Guess the number of threads
    if num_threads is None:
        num_threads = cpu_count()
    info('Will use %d threads' % num_threads)

    # Example providers
    info('Reading data...')
    rf_lab = MalariaRDKFsExampleSet()
    X, y = rf_lab.Xy()
    rf_unl = MalariaRDKFsExampleSet(dset='unl', remove_ambiguous=False)
    rf_scr = MalariaRDKFsExampleSet(dset='scr', remove_ambiguous=False)
    rf_amb = MalariaRDKFsExampleSet(dset='amb')
    # A bit of logging
    info('Data description: %s' % rf_lab.configuration().id(nonids_too=True))
    info('ne=%d; nf=%d' % rf_lab.X().shape)

    # Experiment context: data
    data_id = rf_lab.configuration().id(nonids_too=True)  # TODO: bring hashing from oscail
    data_dir = op.join(dest_dir, data_id)
    ensure_dir(data_dir)

    # Save molids... a bit too ad-hoc...
    info('Saving molids...')
    save_molids(data_dir, 'lab', rf_lab.ids())
    save_molids(data_dir, 'unl', rf_unl.ids())
    save_molids(data_dir, 'scr', rf_scr.ids())
    save_molids(data_dir, 'amb', rf_amb.ids())

    # Main loop - TODO: robustify with try and continue
    for etc, seed, num_trees in product((True, False), seeds, num_treess):

        # Configure the model
        if etc:
            model = ExtraTreesClassifier(n_estimators=num_trees,
                                         n_jobs=num_threads,
                                         bootstrap=True,
                                         oob_score=True,
                                         random_state=seed)
        else:
            model = RandomForestClassifier(n_estimators=num_trees,
                                           n_jobs=num_threads,
                                           oob_score=True,
                                           random_state=seed)

        # Experiment context: model
        model_id = 'trees__etc=%r__num_trees=%d__seed=%d' % (etc, num_trees, seed)  # TODO: bring self-id from oscail
        model_dir = op.join(data_dir, model_id)
        ensure_dir(model_dir)
        info('Model: %s' % model_id)

        # Experiment context: eval
        eval_id = 'oob'
        eval_dir = op.join(model_dir, eval_id)
        ensure_dir(eval_dir)
        info('Eval: OOB (Out Of Bag)')

        # Already done?
        info_file = op.join(eval_dir, 'info.json')
        if op.isfile(info_file) and not force:
            info('\tAlready done, skipping...')
            continue

        # Save model config
        joblib.dump(model, op.join(model_dir, 'model_setup.pkl'), compress=3)

        # Train-full
        info('Training...')
        start = time()
        model.fit(X, y)
        train_time = time() - start  # This is also test-time, as per OOB=True

        # Save trained model? - yeah, lets do it under oob
        if save_trained_models:
            joblib.dump(model, op.join(eval_dir, 'model_trained.pkl'), compress=3)

        # OOB score, auc and enrichment
        oob_score = model.oob_score_
        oob_scores = model.oob_decision_function_
        oob_scores_not_missing = fill_missing_scores(oob_scores[:, 1])

        auc = roc_auc_score(y, oob_scores_not_missing)
        enrichment5 = enrichment_at(y, oob_scores_not_missing, percentage=0.05)

        info('OOB AUC: %.2f' % auc)
        info('OOB Enrichment at 5%%: %.2f' % enrichment5)
        info('OOB Accuracy: %.2f' % oob_score)

        # Save scores and importances
        info('Saving results...')
        with h5py.File(op.join(eval_dir, 'oob_auc=%.2f__scores.h5' % auc), 'w') as h5:

            start = time()

            # Feature importances
            h5['f_names'] = rf_lab.fnames()
            h5['f_importances'] = model.feature_importances_

            # Labelled (development) examples
            info('Scoring lab...')
            h5['lab'] = oob_scores.astype(np.float32)

            info('Scoring amb...')
            h5['amb'] = model.predict_proba(rf_amb.X()).astype(np.float32)

            # Unlabelled (competition) examples
            info('Scoring unl...')
            h5['unl'] = model.predict_proba(rf_unl.X()).astype(np.float32)

            # Unlabelled (screening) examples
            info('Scoring scr...')
            if chunksize <= 0:
                # float32, like everywhere else: an integer type would truncate the probabilities
                h5['scr'] = model.predict_proba(rf_scr.X()).astype(np.float32)
            else:
                scr = h5.create_dataset('scr', shape=(rf_scr.ne_stream(), 2), dtype=np.float32)
                for i, x in enumerate(rf_scr.X_stream(chunksize=chunksize)):
                    base = i * chunksize
                    info('\t num_scr_examples: %d' % base)
                    scr[base:base + chunksize] = model.predict_proba(x)

            test_time = time() - start

        # Finally save meta-information
        metainfo = mlexp_info_helper(
            title='malaria-trees-oob',
            data_setup=data_id,
            model_setup=model_id,
            exp_function=fit,
        )
        metainfo.update((
            ('train_time', train_time),
            ('test_time', test_time),
            ('oob_auc', auc),
            ('oob_enrichment5', enrichment5),
            ('oob_accuracy', oob_score),
        ))
        with open(info_file, 'w') as writer:
            json.dump(metainfo, writer, indent=2, sort_keys=False)