Example #1
0
    train_indices, test_indices = resreg.uniform_test_split(X,
                                                            y,
                                                            bins=bins,
                                                            bin_test_size=70,
                                                            verbose=False,
                                                            random_state=rrr)
    X_train, y_train = X[train_indices, :], y[train_indices]
    X_test, y_test = X[test_indices, :], y[test_indices]

    # Unpack hyperparameters, resample training data, and fit regressors
    reg = DecisionTreeRegressor(random_state=rrr) if 'REBAGG' in strategy else \
              RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=rrr)

    if strategy == 'RO':
        cl, ch, sample_method = param
        relevance = resreg.sigmoid_relevance(y_train, cl=cl, ch=ch)
        X_train, y_train = resreg.random_oversample(X_train,
                                                    y_train,
                                                    relevance,
                                                    relevance_threshold=0.5,
                                                    over=sample_method,
                                                    random_state=rrr)
        reg.fit(X_train, y_train)

    elif strategy == 'SMOTER':
        cl, ch, sample_method, k = param
        relevance = resreg.sigmoid_relevance(y_train, cl=cl, ch=ch)
        X_train, y_train = resreg.smoter(X_train,
                                         y_train,
                                         relevance,
                                         relevance_threshold=0.5,
Example #2
0
# Font dictionaries built on `fnt`, which is defined outside this excerpt.
# Only legend_font is used below (for the threshold label); label_font and
# title_font are presumably consumed by later labelling code -- confirm.
legend_font = {'family': fnt, 'size': '14'}
label_font = {'family': fnt, 'size': '18'}
title_font = {'family': fnt, 'size': '18'}
# Global matplotlib defaults: figure size, regular (non-italic) math text,
# and a base font size of 12.
plt.rcParams['figure.figsize'] = [6, 3.5]
params = {'mathtext.default': 'regular'}
plt.rcParams.update(params)
plt.rc('font', size=12)

# Tick formatter using scientific notation with math-text exponents; power
# limits of (0, 0) make the scientific notation apply at every magnitude.
# It is not attached to any axis within this excerpt.
formatter = ticker.ScalarFormatter(useMathText=True)
formatter.set_scientific(True)
formatter.set_powerlimits((0, 0))

# One-sided relevance: no lower centre (cl1 is None) and the upper centre ch1
# at the 90th percentile of y, evaluated on a grid from 0 to 120 in 0.5 steps.
yrange = np.arange(0, 120.1, step=0.5)
cl1, ch1 = None, np.percentile(y, 90)
rel = resreg.sigmoid_relevance(yrange, cl1, ch1)

# Relevance curve on the primary axis.
ax = plt.subplot()
ax.plot(yrange, rel, label='Relevance', color='crimson', linewidth=2)
# Dashed guide lines: a horizontal one at relevance 0.5 spanning x = 0..120 and
# a vertical one at x = 72.2 rising from 0 to 0.5, plus a text label for the
# 0.5 threshold.
# NOTE(review): 72.2 is hard-coded; presumably it is the value of ch1 (the
# 90th percentile of y) -- confirm, or use ch1 directly.
#ax.axhline(0.5, linestyle='--', color='grey')
ax.hlines(0.5, xmin=0, xmax=120, linestyle='--', color='grey')
ax.vlines(72.2, ymin=0, ymax=0.5, linestyle='--', color='grey')
ax.text(x=5, y=0.53, s=u't$_{R}$=0.5', fontdict=legend_font)

# Gaussian kernel density estimate of y (bandwidth method 0.4), evaluated on
# the same grid and drawn on a twin y-axis sharing the x-axis with `ax`.
kde = gaussian_kde(y, bw_method=0.4)
dens = kde.evaluate(yrange)
ax2 = ax.twinx()
# Constant line at 99 labelled 'Relevance' so that a legend built from ax2 has
# a crimson entry; presumably it lies outside the visible density range set
# later -- confirm.
ax2.plot(yrange, np.ones(len(yrange)) * 99, color='crimson',
         label='Relevance')  # Just for legend
ax2.plot(yrange, dens, color='black', label='Density')
Example #3
0
# The 20 standard amino acids (one-letter codes); fixes the feature order.
aalist = list('ACDEFGHIKLMNPQRSTVWY')


def getAAC(seq):
    """Return the amino-acid composition of a sequence.

    Parameters
    ----------
    seq : str
        Sequence of one-letter amino-acid codes. Matching is case-sensitive
        and characters not in ``aalist`` are counted in the length but
        contribute to no frequency.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(aalist)`` in which element ``i`` is the
        fraction of positions in ``seq`` equal to ``aalist[i]``. An empty
        sequence yields an all-zero vector.
    """
    if not seq:
        # Dividing by a zero length would produce NaNs (and a RuntimeWarning)
        # that silently propagate into downstream scaling and model fitting.
        return np.zeros(len(aalist))
    counts = np.array([seq.count(x) for x in aalist])
    return counts / len(seq)

# Build the standardized feature matrix: per-sequence amino-acid composition
# with the OGT appended as the last column; the target is Topt
#========================================================#
data = pd.read_excel('data/sequence_ogt_topt.xlsx', index_col=0)
aac = np.array(list(map(getAAC, data['sequence'])))
ogt = data['ogt'].values.reshape(-1, 1)  # column vector, one row per sequence
sc = StandardScaler()
X = sc.fit_transform(np.concatenate((aac, ogt), axis=1))
y = data['topt'].values




# Fit TOMER with Rebagg ensemble to all 2,917 sequences
#========================================================#
relevance = resreg.sigmoid_relevance(y, cl=None, ch=72.2)
base_reg = DecisionTreeRegressor(random_state=0)
rebagg = resreg.Rebagg(m=100, s=600, base_reg=base_reg)
rebagg.fit(
    X,
    y,
    relevance=relevance,
    relevance_threshold=0.5,
    sample_method='random_oversample',
    size_method='variation',
    random_state=0,
)



# Save final model (the ensemble and the scaler it was trained with)
#========================#
for _artifact, _path in (
    (rebagg, 'results/final_model/tomer_rebagg.pkl'),
    (sc, 'results/final_model/standard_scaler.pkl'),
):
    joblib.dump(_artifact, _path)
Example #4
0
def sum_sigmoid_relevance(y, low_percentile, high_percentile):
    """Return the summed sigmoid relevance of ``y``, rounded to 3 decimals.

    ``low_percentile`` and ``high_percentile`` are each converted to the
    matching percentile value of ``y`` (or left as ``None`` when given as
    ``None``) and passed, in that order, as the second and third arguments of
    ``resreg.sigmoid_relevance``.
    """
    def _percentile_or_none(pct):
        # None means "no centre on this side"; pass it through untouched.
        if pct is None:
            return None
        return np.percentile(y, pct)

    lower_centre = _percentile_or_none(low_percentile)
    upper_centre = _percentile_or_none(high_percentile)
    total = np.sum(resreg.sigmoid_relevance(y, lower_centre, upper_centre))
    return round(total, 3)