def mols_having_best_feat(penalty='l1', c=1, num_folds=10):
    """Find the fingerprint feature with the largest average logistic-regression
    coefficient (averaged over folds and result groups) and draw the labelled
    molecules that contain it to a PNG grid under MALARIA_EXPS_ROOT/logregs.

    NOTE(review): `penalty` and `c` are currently unused inside the body —
    presumably they were meant to filter `task3_res()`; confirm intent.
    """
    df_t3 = task3_res()
    coefs = []
    for cv_seed, group in df_t3.groupby(['cv_seed']):
        print(cv_seed, len(group))
        # Iterate over the 5 cv seeds for the same num_cv_folds:
        # NOTE(review): this inner loop walks df_t3.result (the WHOLE frame),
        # not `group` — so each result is visited once per cv_seed. Looks like
        # it should iterate `group.result.items()` instead; confirm.
        for _, gr in df_t3.result.items():
            # average over the different folds
            # print df_t3.C
            # print df_t3.num_cv_folds
            # print df_t3.cv_seed
            coefs.append(np.mean(np.array([gr.logreg_coefs(i).ravel() for i in range(num_folds)]), axis=0))
    # Average coefficient per feature across everything collected above
    av_coefs = np.mean(np.array(coefs), axis=0)
    index_of_best = np.argmax(av_coefs)  # column index of the strongest feature
    mfm = MalariaFingerprintsManager(dset='lab')
    feat = mfm.i2s(index_of_best)  # map column index -> SMARTS/feature string
    print(feat)
    # feat = 'n1c(S(C)(=O)=O)sc(N)c1S(c)(=O)=O'
    molids = mfm.mols_with_feature(feat)
    mc = MalariaCatalog()
    mols = mc.molids2mols(molids)
    labels = mc.molids2labels(molids, as01=True)
    print(len(mols))
    # Render the matching molecules aligned on the feature substructure
    draw_in_a_grid_aligned_according_to_pattern(
        mols, feat,
        op.join(MALARIA_EXPS_ROOT, 'logregs', 'Mols_having_best_fpt.png'),
        legends=molids, classes=labels)
def compute_confirmatory(deployers, molids_provider, outfile, y_provider=None, select_top=500, mc=None):
    """Scores and rankings on plain-average for the labelled / ambiguous dataset.

    Parameters
    ----------
    deployers : callable(dset=...) -> (X, f_names)
        Returns the per-model score matrix for a dataset split.
    molids_provider : callable(dset=...) -> array of molids
        Returns the molecule ids for a dataset split.
    outfile : str
        Path of the CSV submission file to write (a '.pkl' Series is written
        next to it for later stacking).
    y_provider : callable() -> labels, or None
        Labels for the 'lab' split, used only to report AUC. May be None.
    select_top : int or None
        How many top-ranked molecules to keep in the submission.
    mc : MalariaCatalog or None
        Reused catalog instance; a fresh one is created when None.

    Returns
    -------
    (molids, scores) : the full (lab + amb) molids and their plain-average scores.
    """
    # Labelled
    Xlab, f_names = deployers(dset='lab')
    # FIX: y_provider defaults to None, so calling it unconditionally crashed
    # with TypeError; the AUC report is informational only, so skip it then.
    if y_provider is not None:
        info('AUC after plain averaging (bagging like): %.3f' %
             roc_auc_score(y_provider(), np.nanmean(Xlab, axis=1)))
    # Ambiguous
    Xamb, _ = deployers(dset='amb')
    # All together
    X = np.vstack((Xlab, Xamb))
    # Scores are just plain averages (nanmean: tolerate models missing a molecule)
    scores = np.nanmean(X, axis=1)
    # Get the molids, smiles, labels, pec50
    lab_molids = molids_provider(dset='lab')
    amb_molids = molids_provider(dset='amb')
    molids = np.hstack((lab_molids, amb_molids))
    if mc is None:
        mc = MalariaCatalog()
    labels = mc.molids2labels(molids)
    pec50s = mc.molids2pec50s(molids)
    smiles = mc.molids2smiless(molids)
    # Rankings
    ranks, (sscores, smolids, slabels, spec50s, ssmiles) = \
        rank_sort(scores, (scores, molids, labels, pec50s, smiles),
                  reverse=True, select_top=select_top)
    # N.B.
    # if analyzing ranking variability, use instead
    # scores2rankings()
    # Save for submission ("molid,smiles,score" rows, best first)
    with open(outfile, 'w') as writer:
        # loop variable renamed from `smiles` to avoid shadowing the array above
        for molid, smi, score in zip(smolids, ssmiles, sscores):
            writer.write('%s,%s,%.6f\n' % (molid, smi, score))
    # Create and save a pandas series to allow further stacking
    s = Series(data=scores, index=molids)
    s.to_pickle(op.join(op.splitext(outfile)[0] + '.pkl'))
    return molids, scores
def final_merged_submissions(calibrate=False, dest_dir=MALARIA_EXPS_ROOT):
    """Very ad-hoc merge of submissions obtained with trees and logistic regressors."""
    #####
    # 0 Preparations
    #####
    # Avoid circular imports
    from ccl_malaria.logregs_fit import MALARIA_LOGREGS_EXPERIMENT_ROOT
    from ccl_malaria.trees_fit import MALARIA_TREES_EXPERIMENT_ROOT
    mc = MalariaCatalog()

    def save_submission(sub, outfile, select_top=500):
        """Rank a molid-indexed score Series and write 'molid,smiles,score' rows."""
        # Get the smiles
        smiles = mc.molids2smiless(sub.index)
        # Rankings
        ranks, (sscores, smolids, ssmiles) = \
            rank_sort(sub.values, (sub.values, sub.index.values, smiles),
                      reverse=True, select_top=select_top)
        # Save for submission
        with open(outfile, 'w') as writer:
            # FIX: was `izip`, a Python-2-only itertools name (NameError under
            # Python 3); use `zip`, consistent with merge_submissions(). The
            # loop variable is renamed so it no longer shadows `smiles` above.
            for molid, smi, score in zip(smolids, ssmiles, sscores):
                writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    #####
    # 1 Robust merge using pandas
    #####
    def read_average_merge(root, prefix):
        """Read hit/unl/scr pickles under root; split hits into labelled/ambiguous."""
        hit = pd.read_pickle(op.join(root, '%s_hitSelection.pkl' % prefix))
        labels = mc.molids2labels(hit.index, as01=True)
        lab = hit[~np.isnan(labels)]
        amb = hit[np.isnan(labels)]
        unl = pd.read_pickle(op.join(root, '%s_unl-averaged.pkl' % prefix))
        scr = pd.read_pickle(op.join(root, '%s_scr-averaged.pkl' % prefix))
        return lab, amb, unl, scr

    tlab, tamb, tunl, tscr = read_average_merge(MALARIA_TREES_EXPERIMENT_ROOT, 'trees')
    llab, lamb, lunl, lscr = read_average_merge(MALARIA_LOGREGS_EXPERIMENT_ROOT, 'logreg')

    lab = DataFrame({'trees': tlab, 'logregs': llab})
    lab['labels'] = mc.molids2labels(lab.index, as01=True)
    assert np.sum(np.isnan(lab.labels)) == 0
    amb = DataFrame({'trees': tamb, 'logregs': lamb})
    unl = DataFrame({'trees': tunl, 'logregs': lunl})
    scr = DataFrame({'trees': tscr, 'logregs': lscr})

    # ATM we take it easy and just drop any NA
    lab.dropna(inplace=True)
    amb.dropna(inplace=True)
    unl.dropna(inplace=True)
    scr.dropna(inplace=True)

    #####
    # 2 Calibration on labelling - careful with overfitting for hitList, do it in cross-val fashion
    #####
    def calibrate_row(row):
        """Isotonic-calibrate one score column against the labelled ground truth."""
        calibrator = IsotonicRegression(y_min=0, y_max=1)
        x = lab[~np.isnan(lab[row])][row].values
        y = lab[~np.isnan(lab[row])]['labels'].values
        calibrator.fit(x, y)
        lab[row] = calibrator.predict(lab[row].values)
        amb[row] = calibrator.predict(amb[row].values)
        unl[row] = calibrator.predict(unl[row].values)
        scr[row] = calibrator.predict(scr[row].values)

    if calibrate:
        calibrate_row('trees')
        calibrate_row('logregs')

    #####
    # 3 Average for the submission in lab-amb
    #####
    submission_lab = (lab.trees + lab.logregs) / 2
    submission_amb = (amb.trees + amb.logregs) / 2
    submission_hts = pd.concat((submission_lab, submission_amb))
    outfile = op.join(dest_dir, 'final-merged-%s-hitSelection.csv' %
                      ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_hts, outfile)

    #####
    # 4 Average predictions for unlabelled
    #####
    submission_unl_avg = (unl.trees + unl.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-unl.csv' % ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_unl_avg, outfile, select_top=None)

    submission_scr_avg = (scr.trees + scr.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-scr.csv' % ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_scr_avg, outfile, select_top=1000)

    #####
    # 5 Stacked (linear regression) for unlabelled
    #####
    stacker = LinearRegression()
    stacker.fit(lab[['trees', 'logregs']], lab.labels)

    submission_unl_st = Series(data=stacker.predict(unl[['trees', 'logregs']]), index=unl.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-unl.csv' % ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_unl_st, outfile, select_top=None)

    submission_scr_st = Series(data=stacker.predict(scr[['trees', 'logregs']]), index=scr.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-scr.csv' % ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_scr_st, outfile, select_top=1000)
def merge_submissions(calibrate=False, select_top_scr=None, with_bug=False, dest_dir=MALARIA_EXPS_ROOT):
    """Very ad-hoc merge of submissions obtained with trees and logistic regressors."""
    #####
    # 0 Preparations
    #####
    # Avoid circular imports
    from ccl_malaria.logregs_fit import MALARIA_LOGREGS_EXPERIMENT_ROOT
    from ccl_malaria.logregs_analysis import malaria_logreg_file_prefix
    from ccl_malaria.trees_fit import MALARIA_TREES_EXPERIMENT_ROOT
    mc = MalariaCatalog()
    ensure_dir(dest_dir)

    # Rank a molid-indexed score Series and write 'molid,smiles,score' rows,
    # best score first, keeping at most `select_top` rows (None = keep all).
    def save_submission(sub, outfile, select_top=500):
        # Get the smiles
        smiles = mc.molids2smiless(sub.index)
        # Rankings
        ranks, (sscores, smolids, ssmiles) = \
            rank_sort(sub.values, (sub.values, sub.index.values, smiles),
                      reverse=True, select_top=select_top)
        # Save for submission
        # NOTE(review): the loop variable `smiles` shadows the array above;
        # harmless here because the array is not used afterwards.
        with open(outfile, 'w') as writer:
            for molid, smiles, score in zip(smolids, ssmiles, sscores):
                writer.write('%s,%s,%.6f\n' % (molid, smiles, score))

    #####
    # 1 Robust merge using pandas
    #####
    # Read the three pickled score Series for one model family and split the
    # hit-selection scores into labelled vs ambiguous using catalog labels.
    def read_average_merge(root, prefix):
        hit = pd.read_pickle(op.join(root, '%s_hitSelection.pkl' % prefix))
        labels = mc.molids2labels(hit.index, as01=True)
        lab = hit[~np.isnan(labels)]
        amb = hit[np.isnan(labels)]
        unl = pd.read_pickle(op.join(root, '%s_unl-averaged.pkl' % prefix))
        scr = pd.read_pickle(op.join(root, '%s_scr-averaged.pkl' % prefix))
        return lab, amb, unl, scr

    tlab, tamb, tunl, tscr = read_average_merge(MALARIA_TREES_EXPERIMENT_ROOT, 'trees')
    # `with_bug` picks which logreg artifact prefix to read (buggy last-fold
    # vs averaged-folds run) — see malaria_logreg_file_prefix.
    llab, lamb, lunl, lscr = read_average_merge(MALARIA_LOGREGS_EXPERIMENT_ROOT,
                                                malaria_logreg_file_prefix(with_bug=with_bug))

    # Align trees and logregs scores on molid (outer join via DataFrame ctor)
    lab = DataFrame({'trees': tlab, 'logregs': llab})
    lab['labels'] = mc.molids2labels(lab.index, as01=True)
    assert np.sum(np.isnan(lab['labels'])) == 0
    amb = DataFrame({'trees': tamb, 'logregs': lamb})
    unl = DataFrame({'trees': tunl, 'logregs': lunl})
    scr = DataFrame({'trees': tscr, 'logregs': lscr})

    # ATM we take it easy and just drop any NA
    lab.dropna(inplace=True)
    amb.dropna(inplace=True)
    unl.dropna(inplace=True)
    scr.dropna(inplace=True)

    #####
    # 2 Calibration on labelling - careful with overfitting for hitList, do it in cross-val fashion
    #####
    # Isotonic-calibrate one score column against the labelled ground truth,
    # then rewrite that column in all four frames with calibrated scores.
    def calibrate_col(col):
        # isotonic not the best here, and faces numerical issues
        calibrator = IsotonicRegression(y_min=0, y_max=1)
        x = lab[~np.isnan(lab[col])][col].values
        y = lab[~np.isnan(lab[col])]['labels'].values
        # This worked with old sklearn
        try:
            # Old sklearn: accepted/required 2-D inputs (n_samples, 1)
            calibrator.fit(x.reshape(-1, 1), y)
            lab[col] = calibrator.predict(lab[col].values.reshape(-1, 1))
            amb[col] = calibrator.predict(amb[col].values.reshape(-1, 1))
            unl[col] = calibrator.predict(unl[col].values.reshape(-1, 1))
            scr[col] = calibrator.predict(scr[col].values.reshape(-1, 1))
        except ValueError:
            # Newer sklearn: IsotonicRegression wants 1-D arrays
            calibrator.fit(x.ravel(), y)
            lab[col] = calibrator.predict(lab[col].values.ravel())
            amb[col] = calibrator.predict(amb[col].values.ravel())
            unl[col] = calibrator.predict(unl[col].values.ravel())
            scr[col] = calibrator.predict(scr[col].values.ravel())

    if calibrate:
        calibrate_col('trees')
        calibrate_col('logregs')

    #####
    # 3 Average for the submission in lab-amb
    #####
    submission_lab = (lab.trees + lab.logregs) / 2
    submission_amb = (amb.trees + amb.logregs) / 2
    submission_hts = pd.concat((submission_lab, submission_amb))
    # Encodes both run options in the file names of all outputs below
    submission_options = '%s-%s' % (
        'calibrated' if calibrate else 'nonCalibrated',
        'lastFold' if with_bug else 'averageFolds')
    outfile = op.join(dest_dir, 'final-merged-%s-hitSelection.csv' % submission_options)
    save_submission(submission_hts, outfile)

    #####
    # 4 Average predictions for unlabelled
    #####
    submission_unl_avg = (unl.trees + unl.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-unl.csv' % submission_options)
    save_submission(submission_unl_avg, outfile, select_top=None)

    submission_scr_avg = (scr.trees + scr.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-scr.csv' % submission_options)
    save_submission(submission_scr_avg, outfile, select_top=select_top_scr)

    #####
    # 5 Stacked (linear regression) for unlabelled
    #####
    stacker = LinearRegression()
    stacker.fit(lab[['trees', 'logregs']], lab['labels'])

    # Predict only rows with all-finite features; other rows get NaN scores
    # (keeps the submission index aligned without crashing the regressor).
    def robust_predict(X):
        X = np.asarray(X)
        row_is_finite = np.all(np.isfinite(X), axis=1)
        scores = np.full(len(X), fill_value=np.nan)
        scores[row_is_finite] = stacker.predict(X[row_is_finite])
        return scores

    # noinspection PyArgumentList
    submission_unl_st = Series(data=robust_predict(unl[['trees', 'logregs']]), index=unl.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-unl.csv' % submission_options)
    save_submission(submission_unl_st, outfile, select_top=None)

    # noinspection PyArgumentList
    submission_scr_st = Series(data=robust_predict(scr[['trees', 'logregs']]), index=scr.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-scr.csv' % submission_options)
    save_submission(submission_scr_st, outfile, select_top=select_top_scr)