# Split data so that the test set is uniformly distributed across the target bins
train_indices, test_indices = resreg.uniform_test_split(X, y, bins=bins, bin_test_size=70,
                                                        verbose=False, random_state=rrr)
X_train, y_train = X[train_indices, :], y[train_indices]
X_test, y_test = X[test_indices, :], y[test_indices]

# Unpack hyperparameters, resample training data, and fit regressors
reg = DecisionTreeRegressor(random_state=rrr) if 'REBAGG' in strategy else \
      RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=rrr)

if strategy == 'RO':
    cl, ch, sample_method = param
    relevance = resreg.sigmoid_relevance(y_train, cl=cl, ch=ch)
    X_train, y_train = resreg.random_oversample(X_train, y_train, relevance,
                                                relevance_threshold=0.5,
                                                over=sample_method, random_state=rrr)
    reg.fit(X_train, y_train)

elif strategy == 'SMOTER':
    cl, ch, sample_method, k = param
    relevance = resreg.sigmoid_relevance(y_train, cl=cl, ch=ch)
    # The original excerpt is truncated here; the remaining keyword arguments are
    # inferred by analogy with the RO branch and the unpacked hyperparameters
    X_train, y_train = resreg.smoter(X_train, y_train, relevance, relevance_threshold=0.5,
                                     over=sample_method, k=k, random_state=rrr)
    reg.fit(X_train, y_train)
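# --- Hypothetical evaluation step (not in the original excerpt) ---
# A minimal sketch of how the fitted regressor could be scored on the uniformly
# sampled test set, assuming standard scikit-learn metrics; the original analysis
# may well use different performance measures.
from sklearn.metrics import mean_squared_error, r2_score

y_pred = reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f'RMSE = {rmse:.2f}, R2 = {r2:.3f}')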
# Plot settings
legend_font = {'family': fnt, 'size': '14'}
label_font = {'family': fnt, 'size': '18'}
title_font = {'family': fnt, 'size': '18'}
plt.rcParams['figure.figsize'] = [6, 3.5]
params = {'mathtext.default': 'regular'}
plt.rcParams.update(params)
plt.rc('font', size=12)
formatter = ticker.ScalarFormatter(useMathText=True)
formatter.set_scientific(True)
formatter.set_powerlimits((0, 0))

# One-sided relevance function (only high Topt values are relevant)
yrange = np.arange(0, 120.1, step=0.5)
cl1, ch1 = None, np.percentile(y, 90)
rel = resreg.sigmoid_relevance(yrange, cl1, ch1)

ax = plt.subplot()
ax.plot(yrange, rel, label='Relevance', color='crimson', linewidth=2)
ax.hlines(0.5, xmin=0, xmax=120, linestyle='--', color='grey')
ax.vlines(72.2, ymin=0, ymax=0.5, linestyle='--', color='grey')
ax.text(x=5, y=0.53, s=u't$_{R}$=0.5', fontdict=legend_font)

# Kernel density estimate of the target distribution on a secondary axis
kde = gaussian_kde(y, bw_method=0.4)
dens = kde.evaluate(yrange)
ax2 = ax.twinx()
ax2.plot(yrange, np.ones(len(yrange)) * 99, color='crimson', label='Relevance')  # Off-scale line, drawn only so the legend shows both curves
ax2.plot(yrange, dens, color='black', label='Density')
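# --- Assumed finishing touches for the relevance/density figure (not in the original excerpt) ---
# Axis labels, legend, and figure export; label text and file name are illustrative only.
# ax2 must be rescaled manually because the off-scale legend line would otherwise
# stretch its autoscaled range to ~99 and flatten the density curve.
ax.set_xlabel('Optimal temperature (°C)', fontdict=label_font)
ax.set_ylabel('Relevance', fontdict=label_font)
ax2.set_ylabel('Density', fontdict=label_font)
ax.set_ylim(0, 1.05)
ax2.set_ylim(0, 1.1 * dens.max())
ax2.legend(prop=legend_font, loc='upper left', frameon=False)
plt.tight_layout()
plt.savefig('plots/relevance_one_sided.pdf')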
import numpy as np
import pandas as pd
import joblib
import resreg
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Amino acid composition (AAC) features
aalist = list('ACDEFGHIKLMNPQRSTVWY')

def getAAC(seq):
    '''Return the fractional amino acid composition of a protein sequence.'''
    aac = np.array([seq.count(x) for x in aalist]) / len(seq)
    return aac

# Prepare features (AAC + OGT) and target (Topt)
data = pd.read_excel('data/sequence_ogt_topt.xlsx', index_col=0)
aac = np.array([getAAC(seq) for seq in data['sequence']])
ogt = data['ogt'].values.reshape((data.shape[0], 1))
X = np.append(aac, ogt, axis=1)
sc = StandardScaler()
X = sc.fit_transform(X)
y = data['topt'].values

# Fit TOMER with Rebagg ensemble to all 2,917 sequences
#========================================================#
base_reg = DecisionTreeRegressor(random_state=0)
rebagg = resreg.Rebagg(m=100, s=600, base_reg=base_reg)
relevance = resreg.sigmoid_relevance(y, cl=None, ch=72.2)
rebagg.fit(X, y, relevance=relevance, relevance_threshold=0.5,
           sample_method='random_oversample', size_method='variation', random_state=0)

# Save final model
#========================#
joblib.dump(rebagg, 'results/final_model/tomer_rebagg.pkl')
joblib.dump(sc, 'results/final_model/standard_scaler.pkl')
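# --- Illustrative use of the saved model (not part of the original script) ---
# A minimal sketch assuming the pickled Rebagg ensemble and scaler are reloaded to
# predict Topt for a new protein; the sequence and OGT value below are made up.
rebagg = joblib.load('results/final_model/tomer_rebagg.pkl')
sc = joblib.load('results/final_model/standard_scaler.pkl')

new_seq = 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ'   # hypothetical protein sequence
new_ogt = 37.0                                  # hypothetical organism growth temperature (°C)
features = np.append(getAAC(new_seq), new_ogt).reshape(1, -1)
features = sc.transform(features)
topt_pred = rebagg.predict(features)
print(f'Predicted Topt: {topt_pred[0]:.1f} °C')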
def sum_sigmoid_relevance(y, low_percentile, high_percentile):
    '''Sum of sigmoid relevance values of y, with control points at the given percentiles
    (pass None to omit the low or high control point).'''
    cl = np.percentile(y, low_percentile) if low_percentile is not None else None
    ch = np.percentile(y, high_percentile) if high_percentile is not None else None
    relevance = resreg.sigmoid_relevance(y, cl, ch)
    return round(np.sum(relevance), 3)
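# --- Hypothetical usage (not in the original code) ---
# For example, comparing the total relevance assigned by a one-sided (high-Topt only)
# and a two-sided relevance function; the percentile values here are illustrative.
one_sided = sum_sigmoid_relevance(y, None, 90)
two_sided = sum_sigmoid_relevance(y, 10, 90)
print(one_sided, two_sided)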