예제 #1
0
def compute_confirmatory(deployers,
                         molids_provider,
                         outfile,
                         y_provider=None,
                         select_top=500,
                         mc=None):
    """Score and rank the labelled + ambiguous molecules by plain averaging.

    The per-model scores of the 'lab' and 'amb' datasets are stacked row-wise
    and each molecule gets the NaN-ignoring mean of its row as final score.

    Parameters
    ----------
    deployers : callable
        Called as ``deployers(dset=...)`` with 'lab' and 'amb'; must return a
        pair whose first element is the score matrix (one row per molecule).
    molids_provider : callable
        Called as ``molids_provider(dset=...)``; returns the molecule ids of
        the dataset, in the same order as the rows given by `deployers`.
    outfile : str
        Path of the CSV to write ("molid,smiles,score" per line). A pickled
        pandas Series with all the scores is saved next to it, using the same
        path with the extension replaced by '.pkl'.
    y_provider : callable or None
        When given, ``y_provider()`` must return the labels of the 'lab'
        dataset; they are only used to log the AUC of the averaged scores.
        When None, the AUC report is skipped.
    select_top : int or None
        Forwarded to `rank_sort` (presumably the number of top ranked
        molecules to keep in the CSV - confirm against `rank_sort`).
    mc : catalog object or None
        Object providing `molids2labels`, `molids2pec50s` and
        `molids2smiless`; a new `MalariaCatalog` is created when None.

    Returns
    -------
    (molids, scores) for all the molecules (labelled first, then ambiguous),
    in dataset order, i.e. neither sorted nor truncated.
    """

    # Labelled
    Xlab, _ = deployers(dset='lab')
    if y_provider is not None:
        # The AUC is purely informative and needs the labels, so it is only
        # reported when the caller supplied a way to get them.
        info('AUC after plain averaging (bagging like): %.3f' %
             roc_auc_score(y_provider(), np.nanmean(Xlab, axis=1)))
    # Ambiguous
    Xamb, _ = deployers(dset='amb')
    # All together
    X = np.vstack((Xlab, Xamb))

    # Scores are just plain averages
    scores = np.nanmean(X, axis=1)

    # Get the molids, smiles, labels, pec50 (same lab-then-amb order as X)
    lab_molids = molids_provider(dset='lab')
    amb_molids = molids_provider(dset='amb')
    molids = np.hstack((lab_molids, amb_molids))

    if mc is None:
        mc = MalariaCatalog()
    labels = mc.molids2labels(molids)
    pec50s = mc.molids2pec50s(molids)
    smiles = mc.molids2smiless(molids)

    # Rankings; labels and pec50s are sorted alongside but not written out
    _, (sscores, smolids, _, _, ssmiles) = \
        rank_sort(scores, (scores, molids, labels, pec50s, smiles),
                  reverse=True,
                  select_top=select_top)

    # N.B.
    # if analyzing ranking variability, use instead
    # scores2rankings()

    # Save for submission
    with open(outfile, 'w') as writer:
        # Distinct loop names so that the full `smiles` array is not clobbered
        for molid, smi, score in zip(smolids, ssmiles, sscores):
            writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    # Create and save a pandas series to allow further stacking
    s = Series(data=scores, index=molids)
    s.to_pickle(op.splitext(outfile)[0] + '.pkl')

    return molids, scores
예제 #2
0
def compute_heldout(dset,
                    deployers,
                    molids_provider,
                    outfile,
                    y_provider=None,
                    stacker=None,
                    select_top=None,
                    mc=None):
    """Compute, rank and save the predictions for a held-out dataset.

    Parameters
    ----------
    dset : str
        Dataset identifier, forwarded to `deployers` and `molids_provider`.
    deployers : callable
        Called as ``deployers(dset=...)``; must return a pair whose first
        element is the score matrix (one row per molecule).
    molids_provider : callable
        Called as ``molids_provider(dset=dset)``; returns the molecule ids in
        the same order as the rows given by `deployers`.
    outfile : str
        Path of the CSV to write ("molid,smiles,score" per line). A pickled
        pandas Series with all the scores is saved next to it, using the same
        path with the extension replaced by '.pkl'.
    y_provider : callable or None
        Required when `stacker` is given: ``y_provider()`` must return the
        labels of the 'lab' dataset used to fit the stacker.
    stacker : estimator or None
        Object with `fit(X, y)` and `predict(X)`. When given, it is fit on the
        'lab' scores and its predictions are the final scores; when None, the
        final score is the NaN-ignoring mean of each row.
    select_top : int or None
        Forwarded to `rank_sort` (presumably the number of top ranked
        molecules to keep in the CSV - confirm against `rank_sort`).
    mc : catalog object or None
        Object providing `molids2smiless`; a new `MalariaCatalog` is created
        when None.

    Returns
    -------
    (molids, scores) for all the molecules of `dset`, in dataset order,
    i.e. neither sorted nor truncated.

    Raises
    ------
    TypeError
        If `stacker` is given but `y_provider` is None.
    """
    X, _ = deployers(dset=dset)

    # Stacking or averaging?
    if stacker is not None:
        if y_provider is None:
            # Fail early and clearly instead of "NoneType is not callable"
            raise TypeError('y_provider is required when a stacker is given')
        Xlab, _ = deployers(dset='lab')
        y = y_provider()
        stacker.fit(Xlab, y)  # Careful: Xlab columns can be extremely collinear...
        # NOTE: predict() is used as the score; stacker.predict_proba(X)[:, 1]
        # would be the alternative for stackers exposing class probabilities.
        scores = stacker.predict(X)
    else:
        scores = np.nanmean(X, axis=1)

    # Get the molids, smiles
    if mc is None:
        mc = MalariaCatalog()
    molids = molids_provider(dset=dset)
    smiles = mc.molids2smiless(molids)

    # Rankings
    _, (sscores, smolids, ssmiles) = \
        rank_sort(scores, (scores, molids, smiles), reverse=True, select_top=select_top)

    # Save for submission
    with open(outfile, 'w') as writer:
        # Built-in zip works on both python 2 and 3 (izip is python 2 only);
        # distinct loop names so that the full `smiles` array is not clobbered
        for molid, smi, score in zip(smolids, ssmiles, sscores):
            writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    # Create and save a pandas series to allow further stacking
    s = Series(data=scores, index=molids)
    s.to_pickle(op.splitext(outfile)[0] + '.pkl')

    return molids, scores