def save_submission(sub, outfile, select_top=500):
    """Write the best-ranked entries of a submission to a text file.

    `sub` must expose `.index` (molecule ids) and `.values` (scores);
    presumably a pandas Series - confirm against callers.
    Each written row has the form "molid,smiles,score", with the score
    formatted to 6 decimal places.

    The molecule catalog `mc` is not a parameter: it is resolved from the
    enclosing / module scope.
    """
    # Translate the molecule ids into smiles strings
    all_smiles = mc.molids2smiless(sub.index)

    # Rank on the scores and co-sort (scores, molids, smiles) accordingly;
    # rank_sort is asked for reverse order and at most `select_top` entries
    columns = (sub.values, sub.index.values, all_smiles)
    _, (top_scores, top_molids, top_smiles) = rank_sort(sub.values,
                                                        columns,
                                                        reverse=True,
                                                        select_top=select_top)

    # Dump one row per selected molecule
    with open(outfile, 'w') as writer:
        writer.writelines('%s,%s,%.6f\n' % (molid, smi, score)
                          for molid, smi, score in izip(top_molids, top_smiles, top_scores))
def compute_confirmatory(deployers, molids_provider, outfile, y_provider=None, select_top=500, mc=None):
    """Scores and rankings on plain-average for the labelled / ambiguous dataset.

    Parameters:
      - deployers: callable accepting a `dset` keyword ('lab' or 'amb') and
        returning a pair whose first element is a matrix of predictions
        (averaged here along axis 1, ignoring NaNs)
      - molids_provider: callable accepting a `dset` keyword and returning the
        molecule ids for that dataset
      - outfile: path of the submission text file; a pickled pandas Series with
        all the scores is saved next to it (same name, '.pkl' extension)
      - y_provider: optional callable returning the labels of the labelled
        dataset; when None, the AUC report is skipped
      - select_top: passed through to rank_sort to limit the saved ranking
      - mc: molecule catalog; a new MalariaCatalog is created when None

    Returns the tuple (molids, scores) for labelled + ambiguous molecules,
    in the original (unsorted) order.
    """
    # Labelled
    Xlab, _ = deployers(dset='lab')
    # The AUC can only be reported when the labels are available
    # (y_provider defaults to None, calling it unconditionally would fail)
    if y_provider is not None:
        info('AUC after plain averaging (bagging like): %.3f' %
             roc_auc_score(y_provider(), np.nanmean(Xlab, axis=1)))

    # Ambiguous
    Xamb, _ = deployers(dset='amb')

    # All together
    X = np.vstack((Xlab, Xamb))

    # Scores are just plain averages
    scores = np.nanmean(X, axis=1)

    # Get the molids, smiles, labels, pec50
    lab_molids = molids_provider(dset='lab')
    amb_molids = molids_provider(dset='amb')
    molids = np.hstack((lab_molids, amb_molids))
    if mc is None:
        mc = MalariaCatalog()
    labels = mc.molids2labels(molids)
    pec50s = mc.molids2pec50s(molids)
    smiles = mc.molids2smiless(molids)

    # Rankings (labels and pec50s are co-sorted too, but not used for the submission file)
    _, (sscores, smolids, _, _, ssmiles) = \
        rank_sort(scores, (scores, molids, labels, pec50s, smiles), reverse=True, select_top=select_top)
    # N.B.
    #   if analyzing ranking variability, use instead
    #     scores2rankings()

    # Save for submission
    with open(outfile, 'w') as writer:
        for molid, smi, score in zip(smolids, ssmiles, sscores):
            writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    # Create and save a pandas series to allow further stacking
    s = Series(data=scores, index=molids)
    s.to_pickle(op.splitext(outfile)[0] + '.pkl')

    return molids, scores
def compute_heldout(dset, deployers, molids_provider, outfile, y_provider=None, stacker=None, select_top=None, mc=None):
    """Predictions for the held-out sets.

    Parameters:
      - dset: name of the held-out dataset, forwarded to `deployers` and
        `molids_provider`
      - deployers: callable accepting a `dset` keyword and returning a pair
        whose first element is a matrix of predictions
      - molids_provider: callable accepting a `dset` keyword and returning the
        molecule ids for that dataset
      - outfile: path of the submission text file; a pickled pandas Series with
        all the scores is saved next to it (same name, '.pkl' extension)
      - y_provider: callable returning the labels of the labelled dataset;
        required when `stacker` is given, unused otherwise
      - stacker: optional model with fit / predict; when given it is fit on the
        labelled predictions and used to score the held-out set, otherwise the
        scores are NaN-ignoring averages along axis 1
      - select_top: passed through to rank_sort to limit the saved ranking
      - mc: molecule catalog; a new MalariaCatalog is created when None

    Returns the tuple (molids, scores) in the original (unsorted) order.

    Raises TypeError if a stacker is provided without a y_provider.
    """
    # Fail fast, before any deployment work, instead of crashing later with
    # "'NoneType' object is not callable"
    if stacker is not None and y_provider is None:
        raise TypeError('compute_heldout requires y_provider when a stacker is provided')

    X, _ = deployers(dset=dset)

    # Stacking or averaging?
    if stacker is not None:
        Xlab, _ = deployers(dset='lab')
        y = y_provider()
        stacker.fit(Xlab, y)  # Careful: Xlab columns can be extremely collinear...
        # N.B. hard predictions are used; stacker.predict_proba(X)[:, 1] was
        # the (disabled) alternative
        scores = stacker.predict(X)
    else:
        scores = np.nanmean(X, axis=1)

    # Get the molids, smiles
    if mc is None:
        mc = MalariaCatalog()
    molids = molids_provider(dset=dset)
    smiles = mc.molids2smiless(molids)

    # Rankings
    _, (sscores, smolids, ssmiles) = \
        rank_sort(scores, (scores, molids, smiles), reverse=True, select_top=select_top)

    # Save for submission
    with open(outfile, 'w') as writer:
        for molid, smi, score in izip(smolids, ssmiles, sscores):
            writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    # Create and save a pandas series to allow further stacking
    s = Series(data=scores, index=molids)
    s.to_pickle(op.splitext(outfile)[0] + '.pkl')

    return molids, scores
(df.folder_seed < 1) & (df.folder_size == 0)) results_for_fs = df[conds].result importances = [] for res in results_for_fs: importances += [res.logreg_coefs(fold).ravel() for fold in res.present_folds()] mean_importance = np.mean(importances, axis=0) std_importance = np.std(importances, axis=0) features = np.arange(len(mean_importance)) # Importance is in the absolute value, sign indicates positive/negative feature ranks, (sfeatures, smean_importance, sstd_importance) =\ rank_sort(mean_importance, (features, mean_importance, std_importance)) # Negative print('Super-negative features') for f, mi, si in izip(sfeatures[:10], smean_importance[:10], sstd_importance[:10]): print('Feature: %d (%.2f +/- %.2f)' % (f, mi, si)) # Positives print('Super-positive features') for f, mi, si in izip(sfeatures[-10:], smean_importance[-10:], sstd_importance[-10:]): print('Feature: %d (%.2f +/- %.2f)' % (f, mi, si)) # Some preparations... rng = np.random.RandomState(52) mfm = MalariaFingerprintsManager(dset='lab') mc = MalariaCatalog()