# Example #1
# 0
def mols_having_best_feat(penalty='l1', c=1, num_folds=10):
    """Find the fingerprint feature with the highest average logreg weight
    and draw every labelled molecule that contains it.

    Parameters
    ----------
    penalty : str, default 'l1'
        Regularisation type (not read in the body; kept for interface
        compatibility with callers).
    c : number, default 1
        Regularisation strength (not read in the body; kept for interface
        compatibility with callers).
    num_folds : int, default 10
        Number of cross-validation folds whose coefficient vectors are averaged.
    """
    df_t3 = task3_res()

    # Quick sanity report: number of result rows per cross-validation seed.
    for cv_seed, group in df_t3.groupby(['cv_seed']):
        print(cv_seed, len(group))

    # Iterate over the 5 cv seeds for the same num_cv_folds:
    # for each result, average the flattened coefficient vectors over all folds.
    fold_mean_coefs = []
    for _, result in df_t3.result.items():
        per_fold = np.array([result.logreg_coefs(fold).ravel() for fold in range(num_folds)])
        fold_mean_coefs.append(per_fold.mean(axis=0))

    # Average across all results, then pick the feature with the largest weight.
    overall_mean = np.array(fold_mean_coefs).mean(axis=0)
    best_index = np.argmax(overall_mean)

    # Map the winning column index back to its SMARTS/feature string.
    mfm = MalariaFingerprintsManager(dset='lab')
    feat = mfm.i2s(best_index)
    print(feat)

    # Collect all labelled molecules carrying that feature, with their labels.
    molids = mfm.mols_with_feature(feat)
    mc = MalariaCatalog()
    mols = mc.molids2mols(molids)
    labels = mc.molids2labels(molids, as01=True)

    print(len(mols))
    draw_in_a_grid_aligned_according_to_pattern(mols, feat,
                                                op.join(MALARIA_EXPS_ROOT, 'logregs', 'Mols_having_best_fpt.png'),
                                                legends=molids, classes=labels)
# Example #2
# 0
def compute_confirmatory(deployers,
                         molids_provider,
                         outfile,
                         y_provider=None,
                         select_top=500,
                         mc=None):
    """Scores and rankings on plain-average for the labelled / ambiguous dataset.

    Parameters
    ----------
    deployers : callable
        Called as ``deployers(dset=...)``; returns ``(X, feature_names)`` for
        the 'lab' and 'amb' datasets.
    molids_provider : callable
        Called as ``molids_provider(dset=...)``; returns the molecule ids of a dataset.
    outfile : str
        Path of the CSV submission file. A pickled pandas Series with *all*
        the scores is written next to it (same name, '.pkl' extension).
    y_provider : callable, default None
        Returns the labels of the labelled dataset; used only to report AUC.
    select_top : int, default 500
        Number of top-ranked molecules written to the submission file.
    mc : MalariaCatalog, default None
        Catalog mapping molids to labels / pEC50 / smiles; created if None.

    Returns
    -------
    A tuple ``(molids, scores)`` over the full labelled + ambiguous set.
    """
    # Labelled
    Xlab, f_names = deployers(dset='lab')
    info('AUC after plain averaging (bagging like): %.3f' % roc_auc_score(y_provider(),
                                                                          np.nanmean(Xlab, axis=1)))
    # Ambiguous
    Xamb, _ = deployers(dset='amb')
    # All together
    X = np.vstack((Xlab, Xamb))

    # Scores are just plain (NaN-aware) averages over the deployed models
    scores = np.nanmean(X, axis=1)

    # Get the molids, smiles, labels, pec50
    lab_molids = molids_provider(dset='lab')
    amb_molids = molids_provider(dset='amb')
    molids = np.hstack((lab_molids, amb_molids))

    if mc is None:
        mc = MalariaCatalog()
    labels = mc.molids2labels(molids)
    pec50s = mc.molids2pec50s(molids)
    smiles = mc.molids2smiless(molids)

    # Rankings (ranks / slabels / spec50s are currently unused but cheap to keep)
    ranks, (sscores, smolids, slabels, spec50s, ssmiles) = \
        rank_sort(scores, (scores, molids, labels, pec50s, smiles),
                  reverse=True,
                  select_top=select_top)

    # N.B.
    # if analyzing ranking variability, use instead
    # scores2rankings()

    # Save for submission
    # (loop variable renamed to `smi` so it no longer shadows the smiles array above)
    with open(outfile, 'w') as writer:
        for molid, smi, score in zip(smolids, ssmiles, sscores):
            writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    # Create and save a pandas series to allow further stacking
    # (op.join on a single argument was a no-op; splitext already yields the path)
    s = Series(data=scores, index=molids)
    s.to_pickle(op.splitext(outfile)[0] + '.pkl')

    return molids, scores
# Example #3
# 0
def final_merged_submissions(calibrate=False, dest_dir=MALARIA_EXPS_ROOT):
    """Very ad-hoc merge of submissions obtained with trees and logistic regressors.

    Parameters
    ----------
    calibrate : bool, default False
        If True, each model's scores are calibrated against the labelled data
        with isotonic regression before merging.
    dest_dir : str, default MALARIA_EXPS_ROOT
        Directory where the merged submission CSV files are written.
    """

    #####
    # 0 Preparations
    #####

    # Avoid circular imports
    from ccl_malaria.logregs_fit import MALARIA_LOGREGS_EXPERIMENT_ROOT
    from ccl_malaria.trees_fit import MALARIA_TREES_EXPERIMENT_ROOT

    mc = MalariaCatalog()

    def save_submission(sub, outfile, select_top=500):
        # Write the top `select_top` molecules of a scores Series as molid,smiles,score CSV.
        # Get the smiles
        smiles = mc.molids2smiless(sub.index)

        # Rankings
        ranks, (sscores, smolids, ssmiles) = \
            rank_sort(sub.values, (sub.values,
                                   sub.index.values,
                                   smiles), reverse=True, select_top=select_top)
        # Save for submission
        # N.B. `zip`, not the python-2-only `izip` (removed in python 3),
        # matching the writer loops elsewhere in this module.
        with open(outfile, 'w') as writer:
            for molid, smi, score in zip(smolids, ssmiles, sscores):
                writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    #####
    # 1 Robust merge using pandas
    #####
    def read_average_merge(root, prefix):
        # Load one model family's pickles and split the hit list into
        # labelled / ambiguous parts; also return unlabelled and screening scores.
        hit = pd.read_pickle(op.join(root, '%s_hitSelection.pkl' % prefix))
        labels = mc.molids2labels(hit.index, as01=True)
        lab = hit[~np.isnan(labels)]
        amb = hit[np.isnan(labels)]
        unl = pd.read_pickle(op.join(root, '%s_unl-averaged.pkl' % prefix))
        scr = pd.read_pickle(op.join(root, '%s_scr-averaged.pkl' % prefix))
        return lab, amb, unl, scr

    tlab, tamb, tunl, tscr = read_average_merge(MALARIA_TREES_EXPERIMENT_ROOT, 'trees')
    llab, lamb, lunl, lscr = read_average_merge(MALARIA_LOGREGS_EXPERIMENT_ROOT, 'logreg')

    lab = DataFrame({'trees': tlab, 'logregs': llab})
    lab['labels'] = mc.molids2labels(lab.index, as01=True)
    assert np.sum(np.isnan(lab.labels)) == 0
    amb = DataFrame({'trees': tamb, 'logregs': lamb})
    unl = DataFrame({'trees': tunl, 'logregs': lunl})
    scr = DataFrame({'trees': tscr, 'logregs': lscr})

    # ATM we take it easy and just drop any NA
    lab.dropna(inplace=True)
    amb.dropna(inplace=True)
    unl.dropna(inplace=True)
    scr.dropna(inplace=True)

    #####
    # 2 Calibration on labelling - careful with overfitting for hitList, do it in cross-val fashion
    #####
    def calibrate_col(col):
        # Fit an isotonic calibrator on the labelled data for one model column
        # and apply it in-place to all four datasets.
        calibrator = IsotonicRegression(y_min=0, y_max=1)
        x = lab[~np.isnan(lab[col])][col].values
        y = lab[~np.isnan(lab[col])]['labels'].values
        calibrator.fit(x, y)
        lab[col] = calibrator.predict(lab[col].values)
        amb[col] = calibrator.predict(amb[col].values)
        unl[col] = calibrator.predict(unl[col].values)
        scr[col] = calibrator.predict(scr[col].values)

    if calibrate:
        calibrate_col('trees')
        calibrate_col('logregs')

    #####
    # 3 Average for the submission in lab-amb
    #####
    submission_lab = (lab.trees + lab.logregs) / 2
    submission_amb = (amb.trees + amb.logregs) / 2
    submission_hts = pd.concat((submission_lab, submission_amb))

    outfile = op.join(dest_dir, 'final-merged-%s-hitSelection.csv' % ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_hts, outfile)

    #####
    # 4 Average predictions for unlabelled
    #####
    submission_unl_avg = (unl.trees + unl.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-unl.csv' % ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_unl_avg, outfile, select_top=None)

    submission_scr_avg = (scr.trees + scr.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-scr.csv' % ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_scr_avg, outfile, select_top=1000)

    #####
    # 5 Stacked (linear regression) for unlabelled
    #####
    stacker = LinearRegression()
    stacker.fit(lab[['trees', 'logregs']], lab.labels)

    submission_unl_st = Series(data=stacker.predict(unl[['trees', 'logregs']]), index=unl.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-unl.csv' % ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_unl_st, outfile, select_top=None)

    submission_scr_st = Series(data=stacker.predict(scr[['trees', 'logregs']]), index=scr.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-scr.csv' % ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_scr_st, outfile, select_top=1000)
# Example #4
# 0
def merge_submissions(calibrate=False,
                      select_top_scr=None,
                      with_bug=False,
                      dest_dir=MALARIA_EXPS_ROOT):
    """Very ad-hoc merge of submissions obtained with trees and logistic regressors.

    Parameters
    ----------
    calibrate : bool, default False
        If True, each model's scores are calibrated against the labelled data
        with isotonic regression before merging.
    select_top_scr : int or None, default None
        Number of top screening molecules to keep in the screening submissions
        (None keeps all).
    with_bug : bool, default False
        Forwarded to malaria_logreg_file_prefix to select which logreg result
        files are read ('lastFold' vs 'averageFolds' variants); also reflected
        in the output file names.
    dest_dir : str, default MALARIA_EXPS_ROOT
        Directory where the merged submission CSV files are written (created
        if missing).
    """

    #####
    # 0 Preparations
    #####

    # Avoid circular imports
    from ccl_malaria.logregs_fit import MALARIA_LOGREGS_EXPERIMENT_ROOT
    from ccl_malaria.logregs_analysis import malaria_logreg_file_prefix
    from ccl_malaria.trees_fit import MALARIA_TREES_EXPERIMENT_ROOT

    mc = MalariaCatalog()

    ensure_dir(dest_dir)

    def save_submission(sub, outfile, select_top=500):
        # Write the top `select_top` entries of a scores Series as
        # molid,smiles,score CSV rows (select_top=None keeps everything).
        # Get the smiles
        smiles = mc.molids2smiless(sub.index)

        # Rankings
        ranks, (sscores, smolids, ssmiles) = \
            rank_sort(sub.values, (sub.values,
                                   sub.index.values,
                                   smiles), reverse=True, select_top=select_top)
        # Save for submission
        with open(outfile, 'w') as writer:
            for molid, smiles, score in zip(smolids, ssmiles, sscores):
                writer.write('%s,%s,%.6f\n' % (molid, smiles, score))

    #####
    # 1 Robust merge using pandas
    #####
    def read_average_merge(root, prefix):
        # Load one model family's pickles and split the hit list into
        # labelled / ambiguous parts; also return unlabelled and screening scores.
        hit = pd.read_pickle(op.join(root, '%s_hitSelection.pkl' % prefix))
        labels = mc.molids2labels(hit.index, as01=True)
        lab = hit[~np.isnan(labels)]
        amb = hit[np.isnan(labels)]
        unl = pd.read_pickle(op.join(root, '%s_unl-averaged.pkl' % prefix))
        scr = pd.read_pickle(op.join(root, '%s_scr-averaged.pkl' % prefix))
        return lab, amb, unl, scr

    tlab, tamb, tunl, tscr = read_average_merge(MALARIA_TREES_EXPERIMENT_ROOT, 'trees')
    llab, lamb, lunl, lscr = read_average_merge(MALARIA_LOGREGS_EXPERIMENT_ROOT,
                                                malaria_logreg_file_prefix(with_bug=with_bug))

    # Align both model families on the molecule index; keep the 0/1 labels.
    lab = DataFrame({'trees': tlab, 'logregs': llab})
    lab['labels'] = mc.molids2labels(lab.index, as01=True)
    assert np.sum(np.isnan(lab['labels'])) == 0
    amb = DataFrame({'trees': tamb, 'logregs': lamb})
    unl = DataFrame({'trees': tunl, 'logregs': lunl})
    scr = DataFrame({'trees': tscr, 'logregs': lscr})

    # ATM we take it easy and just drop any NA
    lab.dropna(inplace=True)
    amb.dropna(inplace=True)
    unl.dropna(inplace=True)
    scr.dropna(inplace=True)

    #####
    # 2 Calibration on labelling - careful with overfitting for hitList, do it in cross-val fashion
    #####
    def calibrate_col(col):
        # Fit an isotonic calibrator on the labelled data for one model column
        # and apply it in-place to all four datasets (closes over lab/amb/unl/scr).
        # isotonic not the best here, and faces numerical issues
        calibrator = IsotonicRegression(y_min=0, y_max=1)
        x = lab[~np.isnan(lab[col])][col].values
        y = lab[~np.isnan(lab[col])]['labels'].values
        # This worked with old sklearn
        try:
            # Old sklearn
            calibrator.fit(x.reshape(-1, 1), y)
            lab[col] = calibrator.predict(lab[col].values.reshape(-1, 1))
            amb[col] = calibrator.predict(amb[col].values.reshape(-1, 1))
            unl[col] = calibrator.predict(unl[col].values.reshape(-1, 1))
            scr[col] = calibrator.predict(scr[col].values.reshape(-1, 1))
        except ValueError:
            # Newer sklearn
            calibrator.fit(x.ravel(), y)
            lab[col] = calibrator.predict(lab[col].values.ravel())
            amb[col] = calibrator.predict(amb[col].values.ravel())
            unl[col] = calibrator.predict(unl[col].values.ravel())
            scr[col] = calibrator.predict(scr[col].values.ravel())

    if calibrate:
        calibrate_col('trees')
        calibrate_col('logregs')

    #####
    # 3 Average for the submission in lab-amb
    #####
    submission_lab = (lab.trees + lab.logregs) / 2
    submission_amb = (amb.trees + amb.logregs) / 2
    submission_hts = pd.concat((submission_lab, submission_amb))

    # Encode the run options in the output file names.
    submission_options = '%s-%s' % (
        'calibrated' if calibrate else 'nonCalibrated',
        'lastFold' if with_bug else 'averageFolds')

    outfile = op.join(dest_dir, 'final-merged-%s-hitSelection.csv' % submission_options)
    save_submission(submission_hts, outfile)

    #####
    # 4 Average predictions for unlabelled
    #####
    submission_unl_avg = (unl.trees + unl.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-unl.csv' % submission_options)
    save_submission(submission_unl_avg, outfile, select_top=None)

    submission_scr_avg = (scr.trees + scr.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-scr.csv' % submission_options)
    save_submission(submission_scr_avg, outfile, select_top=select_top_scr)

    #####
    # 5 Stacked (linear regression) for unlabelled
    #####
    stacker = LinearRegression()
    stacker.fit(lab[['trees', 'logregs']], lab['labels'])

    def robust_predict(X):
        # Predict only on rows without NaN/inf; everything else scores NaN,
        # so the stacker never sees non-finite inputs.
        X = np.asarray(X)
        row_is_finite = np.all(np.isfinite(X), axis=1)
        scores = np.full(len(X), fill_value=np.nan)
        scores[row_is_finite] = stacker.predict(X[row_is_finite])
        return scores

    # noinspection PyArgumentList
    submission_unl_st = Series(data=robust_predict(unl[['trees', 'logregs']]), index=unl.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-unl.csv' % submission_options)
    save_submission(submission_unl_st, outfile, select_top=None)

    # noinspection PyArgumentList
    submission_scr_st = Series(data=robust_predict(scr[['trees', 'logregs']]), index=scr.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-scr.csv' % submission_options)
    save_submission(submission_scr_st, outfile, select_top=select_top_scr)