def compute_confirmatory(deployers, molids_provider, outfile, y_provider=None, select_top=500, mc=None):
    """Scores and rankings on plain-average for the labelled / ambiguous dataset.

    The per-row scores of the 'lab' and 'amb' datasets are stacked (labelled first)
    and averaged with ``np.nanmean(..., axis=1)``, so NaN entries are ignored.

    Parameters
    ----------
    deployers : callable
        Called as ``deployers(dset='lab')`` and ``deployers(dset='amb')``; must return
        a pair whose first element is the scores matrix (presumably one row per
        molecule and one column per model - confirm against the caller).
        The second element is not used here.
    molids_provider : callable
        Called as ``molids_provider(dset=...)``; returns the molecule ids of a dataset.
    outfile : str
        Path of the CSV file to write. A pickled pandas Series is also written next
        to it, using the same path with the extension replaced by '.pkl'.
    y_provider : callable or None
        If given, ``y_provider()`` must return the labels of the 'lab' dataset; they
        are only used to log the AUC of the averaged scores. If None, the AUC is
        not logged.
    select_top : int or None
        Forwarded as-is to ``rank_sort``.
    mc : catalog object or None
        Must provide ``molids2labels``, ``molids2pec50s`` and ``molids2smiless``.
        A new ``MalariaCatalog()`` is created when None.

    Returns
    -------
    (molids, scores) for labelled + ambiguous molecules, in the order given by the
    providers (i.e. not in ranking order).
    """

    # Labelled
    Xlab, _ = deployers(dset='lab')
    # The labels are only needed for this log message, so do not fail without them
    if y_provider is not None:
        info('AUC after plain averaging (bagging like): %.3f' %
             roc_auc_score(y_provider(), np.nanmean(Xlab, axis=1)))
    # Ambiguous
    Xamb, _ = deployers(dset='amb')
    # All together
    X = np.vstack((Xlab, Xamb))

    # Scores are just plain averages
    scores = np.nanmean(X, axis=1)

    # Get the molids, smiles, labels, pec50 (same order as the rows of X)
    lab_molids = molids_provider(dset='lab')
    amb_molids = molids_provider(dset='amb')
    molids = np.hstack((lab_molids, amb_molids))

    if mc is None:
        mc = MalariaCatalog()
    labels = mc.molids2labels(molids)
    pec50s = mc.molids2pec50s(molids)
    smiles = mc.molids2smiless(molids)

    # Rankings
    # Labels and pec50s are sorted alongside but are not written to the submission file
    _ranks, (sscores, smolids, _slabels, _spec50s, ssmiles) = \
        rank_sort(scores, (scores, molids, labels, pec50s, smiles), reverse=True, select_top=select_top)
    # N.B.
    # if analyzing ranking variability, use instead
    # scores2rankings()

    # Save for submission: one "molid,smiles,score" row per selected molecule
    with open(outfile, 'w') as writer:
        for molid, smi, score in zip(smolids, ssmiles, sscores):
            writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    # Create and save a pandas series (all scores, unsorted) to allow further stacking
    s = Series(data=scores, index=molids)
    s.to_pickle(op.splitext(outfile)[0] + '.pkl')

    return molids, scores
def compute_heldout(dset, deployers, molids_provider, outfile, y_provider=None, stacker=None, select_top=None, mc=None):
    """Predictions for the held-out sets.

    Scores are either the plain NaN-ignoring average of the rows of the deployed
    matrix (``stacker is None``) or the predictions of ``stacker`` after fitting it
    on the 'lab' dataset.

    Parameters
    ----------
    dset : str
        Identifier of the held-out dataset, forwarded to ``deployers`` and
        ``molids_provider``.
    deployers : callable
        Called as ``deployers(dset=...)``; must return a pair whose first element is
        the scores matrix. The second element is not used here.
    molids_provider : callable
        Called as ``molids_provider(dset=dset)``; returns the molecule ids.
    outfile : str
        Path of the CSV file to write. A pickled pandas Series is also written next
        to it, using the same path with the extension replaced by '.pkl'.
    y_provider : callable or None
        ``y_provider()`` must return the labels of the 'lab' dataset.
        Required when ``stacker`` is given, unused otherwise.
    stacker : object or None
        Model providing ``fit(X, y)`` and ``predict(X)``.
    select_top : int or None
        Forwarded as-is to ``rank_sort``.
    mc : catalog object or None
        Must provide ``molids2smiless``. A new ``MalariaCatalog()`` is created when None.

    Returns
    -------
    (molids, scores) in the order given by the providers (i.e. not in ranking order).

    Raises
    ------
    TypeError
        If ``stacker`` is given but ``y_provider`` is None.
    """

    # Fail early, before any deployment work, instead of an opaque
    # "'NoneType' object is not callable" later on
    if stacker is not None and y_provider is None:
        raise TypeError('y_provider is required to fit the stacker')

    X, _ = deployers(dset=dset)

    # Stacking or averaging?
    if stacker is not None:
        Xlab, _ = deployers(dset='lab')
        y = y_provider()
        stacker.fit(Xlab, y)  # Careful: Xlab columns can be extremely collinear...
        # NOTE: stacker.predict_proba(X)[:, 1] was a disabled alternative here
        scores = stacker.predict(X)
    else:
        scores = np.nanmean(X, axis=1)

    # Get the molids, smiles
    if mc is None:
        mc = MalariaCatalog()
    molids = molids_provider(dset=dset)
    smiles = mc.molids2smiless(molids)

    # Rankings
    _ranks, (sscores, smolids, ssmiles) = \
        rank_sort(scores, (scores, molids, smiles), reverse=True, select_top=select_top)

    # Save for submission: one "molid,smiles,score" row per selected molecule
    with open(outfile, 'w') as writer:
        for molid, smi, score in zip(smolids, ssmiles, sscores):
            writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    # Create and save a pandas series (all scores, unsorted) to allow further stacking
    s = Series(data=scores, index=molids)
    s.to_pickle(op.splitext(outfile)[0] + '.pkl')

    return molids, scores