Example #1
from numpy.testing import assert_array_equal, assert_raises
from sklearn.model_selection import RepeatedKFold


def test_repeated_kfold_deterministic_split():
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    random_state = 258173307
    rkf = RepeatedKFold(
        n_splits=2,
        n_repeats=2,
        random_state=random_state)

    # split should produce same and deterministic splits on
    # each call
    for _ in range(3):
        splits = rkf.split(X)
        train, test = next(splits)
        assert_array_equal(train, [2, 4])
        assert_array_equal(test, [0, 1, 3])

        train, test = next(splits)
        assert_array_equal(train, [0, 1, 3])
        assert_array_equal(test, [2, 4])

        train, test = next(splits)
        assert_array_equal(train, [0, 1])
        assert_array_equal(test, [2, 3, 4])

        train, test = next(splits)
        assert_array_equal(train, [2, 3, 4])
        assert_array_equal(test, [0, 1])

        assert_raises(StopIteration, next, splits)
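RepeatedKFold re-seeds from random_state on every call to split, so full enumerations are reproducible as well. A minimal sketch of the same guarantee the test pins down:

from sklearn.model_selection import RepeatedKFold

X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=258173307)

# two independent calls to split() enumerate identical folds
first = [(tr.tolist(), te.tolist()) for tr, te in rkf.split(X)]
second = [(tr.tolist(), te.tolist()) for tr, te in rkf.split(X)]
assert first == second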
Example #2
    kf = RepeatedKFold(n_splits=num_fold, n_repeats=1, random_state=666)

    # Define a loop for plotting figures.
    max_samples_batch = 200
    batch_size = 1

    # Shuffle the dataset.
    combined = list(zip(dataset, strings))
    random.seed(666)
    random.shuffle(combined)
    dataset[:], strings[:] = zip(*combined)

    pool = multiprocessing.Pool(os.cpu_count())
    args = []
    # print(os.cpu_count())  # counts logical processors, not physical cores
    for train_idx, test_idx in kf.split(dataset):
        tmp_args = {
            'train_idx': train_idx,
            'test_idx': test_idx,
            'dataset': dataset,
            'strings': strings,
            'max_samples_batch': max_samples_batch,
            'batch_size': batch_size,
        }
        args.append(tmp_args)
    results = pool.map(cv_edit_active_learn, args)
    # print(len(results))
    # print(len(results[0]))
    phrase_acc = [results[i][0] for i in range(num_fold)]
    out_acc = [results[i][1] for i in range(num_fold)]
    # print(len(phrase_acc))
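pool.map passes a single argument to the worker, which is why each fold's parameters are packed into one dict above. A toy sketch of the same pattern (the worker here is hypothetical, standing in for cv_edit_active_learn):

import multiprocessing
import os


def toy_worker(fold_args):  # hypothetical stand-in for cv_edit_active_learn
    # unpack the single dict argument that pool.map delivers per call
    return len(fold_args['train_idx']), len(fold_args['test_idx'])


if __name__ == '__main__':
    folds = [{'train_idx': [0, 1, 2], 'test_idx': [3, 4]},
             {'train_idx': [2, 3, 4], 'test_idx': [0, 1]}]
    with multiprocessing.Pool(os.cpu_count()) as pool:
        print(pool.map(toy_worker, folds))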
Example #3
                yield filenames, images
                filenames = []
                images = np.zeros(self.batch_shape)
                idx = 0
        if idx > 0:
            yield filenames, images


# k-fold cross-validation
from sklearn.model_selection import RepeatedKFold

splitter = RepeatedKFold(n_splits=3, n_repeats=1, random_state=0)

partitions = []

for train_idx, test_idx in splitter.split(train_labels.index.values):
    partition = {}
    partition["train"] = train_labels.Id.values[train_idx]
    partition["validation"] = train_labels.Id.values[test_idx]
    partitions.append(partition)
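A quick sanity check on the partitions built above (a sketch; it assumes the train_labels frame has unique Id values):

for partition in partitions:
    train_ids = set(partition["train"])
    val_ids = set(partition["validation"])
    # within each fold, train and validation must be disjoint
    # and together cover every labeled Id
    assert not train_ids & val_ids
    assert train_ids | val_ids == set(train_labels.Id.values)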


# Define the CNN parameters
class ModelParameter:
    def __init__(self,
                 basepath,
                 num_classes=28,
                 image_rows=512,
                 image_cols=512,
                 batch_size=200,
                 n_channels=1,
Example #4
# Standardize the data
x = preprocessing.scale(x)

# Split the dataset (20% held out as the test set)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True)

tt = time.time()

forest = RandomForestClassifier(criterion='entropy')

# Training: k-fold cross-validation (5 folds here, repeated 10 times)
kf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=0)
kf_score = []
for t, v in kf.split(x_train):
    forest.fit(x_train[t], y_train[t])  # fitting
    val_score = forest.score(x_train[v], y_train[v])
    kf_score.append(val_score)

print('time: {:.5f}s'.format(time.time() - tt))

tt = time.time()
# Test result
accuracy_score = forest.score(x_test, y_test)
print('time: {:.5f}s'.format(time.time() - tt))

print('Validation accuracy_score: {:.4f}'.format(np.mean(kf_score)))
print("Test accuracy_score: {:.4f}".format(accuracy_score))
Example #5
#test
#X = np.array(sorted(set(subdf.index.date)))
#rkf = RepeatedKFold(n_splits=10, n_repeats=1, random_state=12883823)
#for train_index,test_index in rkf.split(X):
#    tmp = subdf[pd.to_datetime(subdf.index.date).isin(X[test_index])]
#    print(tmp['Weekday'].value_counts().sort_index())
    
all_date = np.array(sorted(set(dfPm.index.date)))
rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=12883823)
a = 0
dict_of_2019=dict()
dict_of_max2019=dict()
RF1_feature_table = pd.DataFrame(columns = dfPm.keys()[2:]) 
RF2_feature_table = pd.DataFrame(columns = dfPm.keys()[2:])
for train_index, test_index in rkf.split(all_date):
    train_datetime_index = pd.to_datetime(dfPm.index.date).isin(all_date[train_index])
    test_datetime_index = pd.to_datetime(dfPm.index.date).isin(all_date[test_index])
    X_train, X_test = X[train_datetime_index], X[test_datetime_index]
    y_train, y_test = y[train_datetime_index], y[test_datetime_index]
    
    # feature extraction
    model_RF = RandomForestRegressor(n_estimators=100, max_depth=7,random_state=137)
    model_RF = model_RF.fit(X_train, y_train)
    
    new_train = pd.DataFrame(X_train)
    new_train['obs_o3'] = y_train
    new_train['pred_o3'] = model_RF.predict(X_train)
    new_train['diff_o3'] = abs(new_train['pred_o3'] - new_train['obs_o3'])
    new_train = new_train[new_train['diff_o3'] > 5]
    X_train2 = np.array(new_train.drop(['obs_o3', 'pred_o3', 'diff_o3'], axis=1))
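The example is cut off here. A plausible continuation in the same spirit (y_train2 and the second forest are assumptions, mirroring the first fit):

    # assumption: refit a second forest on the high-residual subset
    y_train2 = np.array(new_train['obs_o3'])
    model_RF2 = RandomForestRegressor(n_estimators=100, max_depth=7,
                                      random_state=137)
    model_RF2 = model_RF2.fit(X_train2, y_train2)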
Example #6
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]),
                                   ntree_limit=clf.best_ntree_limit)
    predictions_xgb += clf.predict(
        xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, y)))

# Stack the lgb and xgb out-of-fold predictions
train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
oof_stack1 = np.zeros(train_stack.shape[0])
predictions1 = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, y)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], y.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], y.iloc[val_idx].values

    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)

    oof_stack1[val_idx] = clf_3.predict(val_data)
    predictions1 += clf_3.predict(test_stack) / 10  # 10 = n_splits * n_repeats

print("CV score: {:<8.8f}".format(mean_squared_error(y.values, oof_stack1)))

sub_df = pd.DataFrame()
sub_df[0] = pd.read_csv('./jinnan_round1_testB_20190121.csv',
                        header=None)[0][1:]
Example #7
def regression_cross_validate_perseverance(data):

    dm = data['DM'][0]
    ccs = []
    ccs_ch = []
    ccs_rew = []

    for s, sess in enumerate(dm):
        DM = dm[s]
        choices = DM[:, 1]
        reward = DM[:, 2]

        block = DM[:, 4]
        block_df = np.diff(block)
        ind_block = np.where(block_df != 0)[0]

        if len(ind_block) >= 11:

            trials_since_block = []
            t = 0

            for st, _ in enumerate(block):
                if block[st - 1] != block[st]:
                    t = 0
                else:
                    t += 1
                trials_since_block.append(t)

            #block_totals_ind = (np.where(np.asarray(ind_block) == 1)[0]-1)[1:]
            block_totals_ind = ind_block
            block_totals = np.diff(block_totals_ind) - 1
            trials_since_block = trials_since_block[:ind_block[11]]
            fraction_list = []

            for t, trial in enumerate(trials_since_block):

                if t <= block_totals_ind[0]:
                    fr = trial / block_totals_ind[0]
                elif t > block_totals_ind[10]:
                    fr = trial / trials_since_block[-1]
                else:
                    # locate the block interval containing t; this replaces
                    # the original 11-branch elif chain without changing
                    # behavior
                    k = np.searchsorted(block_totals_ind, t, side='left')
                    fr = trial / block_totals[k - 1]
                fraction_list.append(fr)

            choices = choices[:ind_block[11]]
            reward = reward[:ind_block[11]]

            last_reward = []

            for r, rew in enumerate(reward):
                if r > 0:
                    if reward[r - 1] == 1:
                        last_reward.append(1)
                    elif reward[r - 1] == 0:
                        last_reward.append(0)

            last_choice = []

            for c, ch in enumerate(choices):
                if c > 0:
                    if choices[c - 1] == 1:
                        last_choice.append(1)
                    elif choices[c - 1] == 0:
                        last_choice.append(0)

            fraction_list = np.asarray(fraction_list)[1:]

            last_reward = np.asarray(last_reward)
            last_choice = np.asarray(last_choice)
            fraction_reward = last_reward * fraction_list
            fraction_choice = last_choice * fraction_list
            trials = len(fraction_choice)

            predictors_all = OrderedDict([
                ('Last Reward', last_reward), ('Last Choice', last_choice),
                ('Block Fraction', fraction_list),
                ('Block Fraction x Choice', fraction_choice),
                ('Block Fraction x Reward', fraction_reward)
            ])

            X = np.vstack(predictors_all.values()).T[:trials, :].astype(float)
            y = choices[1:]
            kf = RepeatedKFold(n_splits=5, n_repeats=2,
                               random_state=99)  #initialise repeated K-Fold

            # initialise containers for storing cross-validated fits
            ccx = []
            ccx_ch = []
            ccx_rew = []

            for train_ix, test_ix in kf.split(y):

                y_train = y[train_ix]
                y_test = y[test_ix]  #get train and test indices for activity

                # get train and test DM without time in block interactions;
                # slicing keeps the arrays 2-D, as sklearn's fit() requires
                x_train_no_choice_int = X[train_ix, 0:1]
                x_test_no_choice_int = X[test_ix, 0:1]

                # get train and test DM with time in block interactions
                x_train_choice_int = X[train_ix, 1:2]
                x_test_choice_int = X[test_ix, 1:2]
                x_train_rew_int = X[train_ix, 2:3]
                x_test_rew_int = X[test_ix, 2:3]

                #fit linear model with regularisation. Ideally would do nested K-fold
                #to select optimal hyper-parameter
                linR = lm.LogisticRegression(fit_intercept=True)
                ft = linR.fit(x_train_no_choice_int, y_train)

                ccx.append(
                    np.corrcoef(ft.predict(x_test_no_choice_int),
                                y_test)[0,
                                        1])  #get cross validated fit quality

                linR = lm.LogisticRegression(fit_intercept=True)

                ft = linR.fit(x_train_choice_int, y_train)
                ccx_ch.append(
                    np.corrcoef(ft.predict(x_test_choice_int),
                                y_test)[0,
                                        1])  #get cross validated fit quality

                linR = lm.LogisticRegression(fit_intercept=True)

                ft = linR.fit(x_train_rew_int, y_train)
                ccx_rew.append(
                    np.corrcoef(ft.predict(x_test_rew_int),
                                y_test)[0,
                                        1])  #get cross validated fit quality

            ccs.append(np.nanmean(ccx))
            ccs_ch.append(np.nanmean(ccx_ch))
            ccs_rew.append(np.nanmean(ccx_rew))

    c1 = np.array(ccs)**2
    c2 = np.array(ccs_ch)**2

    ixs = np.logical_and.reduce([np.isfinite(c1), np.isfinite(c2)])
    t, p = stt.ttest_rel(c1[ixs], c2[ixs])
    print(
        'Variance explained \nwithout time in block: {:.5f}\nwith time in block: {:.5f}'
        .format(np.nanmean(c1), np.nanmean(c2)))
    print('t:{:.3f}\np:{:.3e}'.format(t, p))
Example #8
def yacht():
    df = pd.read_table(f'{datasets_folder}/yacht.txt', sep=r'\s+', header=None)
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    cv = RepeatedKFold(n_splits=10, n_repeats=4)
    return X, y, cv.split(X)
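A usage sketch (the regressor choice is an assumption; yacht() returns the generator from cv.split, which cross_val_score accepts as cv):

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

X, y, splits = yacht()
scores = cross_val_score(LinearRegression(), X, y, cv=splits,
                         scoring='neg_root_mean_squared_error')
print(-scores.mean())  # mean RMSE over 10 folds x 4 repeats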
Example #9
from sklearn.model_selection import RepeatedKFold
from sklearn import svm, metrics
from scipy import stats
import pandas as pd
import numpy
import training

training.createdata()
dataset = pd.read_csv('dir/hog.csv')
dataset = dataset[(numpy.abs(stats.zscore(dataset)) < 5.04).all(axis=1)]
random_state = 12883823
rkf = RepeatedKFold(n_splits=5, n_repeats=30, random_state=random_state)
result = next(rkf.split(dataset), None)

data_train = dataset.iloc[result[0]]
data_test = dataset.iloc[result[1]]

data = data_train.iloc[:, [0, 3780]]
target = data_train.iloc[:, [3781]]

classifier = svm.SVC(C=1, gamma=0.1)
classifier.fit(data, target.values.ravel())

dataset_teste = pd.read_csv('dir/test_hog.csv')

predicted = classifier.predict(dataset_teste.iloc[:, [0, 3780]])
print(metrics.classification_report(dataset_teste.iloc[:, [3781]], predicted))
print("Confusion matrix:\n%s" %
      metrics.confusion_matrix(dataset_teste.iloc[:, [3781]], predicted))
print(
    classifier.score(dataset_teste.iloc[:, [0, 3780]],
                     dataset_teste.iloc[:, [3781]]))
Example #10
def analyze_dataset(d_seq, sample_diam, flag):

    # load features and targets
    X = np.load(STORE_DIR + "data_x.npy")
    y = np.load(STORE_DIR + "data_y.npy")
    print(X.shape, y.shape)

    # remove all cases with no tumor cells in the sampled tile
    mask = y == -1
    y = y[~mask]
    X = X[~mask, :]

    # set aside holdout set here

    # feature_names = ["".join(["f", str(x)]) for x in range(X.shape[1])]
    # feature_names.append('y')
    # feature_names
    # tmp = pd.DataFrame(np.hstack((X, y.reshape(-1,1))))
    # tmp.columns = feature_names
    # tmp.to_csv('local_data.csv', index=False)

    # add logit transformed response variable
    dtr = learning.VectorTransform(y)
    yt = dtr.zero_one_scale().apply('logit')
    plt.hist(yt)

    plt.hist(y)

    plt.hist(np.sqrt(y))
    plt.scatter(X[:, 12], y)

    # for i in range(30):
    #
    #     p = int(i / 5)
    #     r = i % 5
    #     plt.hist(X[:,i])
    #
    #     plt.title(phens[p] + '_' + str(diams[r]))
    #     plt.show()

    # from sklearn.feature_selection import mutual_info_regression
    # y_noise = y + np.random.normal(scale=0.01, size=(len(y)))
    # for i in range(6):
    #     print "Phenotype ", i
    #     mi = mutual_info_regression(X[:,i].reshape(-1, 1), y.reshape(-1, 1))
    #     print "MI: ", mi
    #     # display.scatter_hist(X[:,i], y)
    #     # plt.scatter(X[:,i], y_noise, s=0.3)
    #     print "Corr: ", helper.metrics.corr(X[:, i], y)
    #     # plt.show()

    # X_train, X_test, y_train, y_test = train_test_split(X, discrete_response,
    #                                                     test_size=0.4)
    # from sklearn.linear_model import LassoCV
    # # from sklearn.neural_network import MLPClassifier
    # # from sklearn.ensemble import AdaBoostClassifier
    # # from sklearn.tree import DecisionTreeClassifier
    # # rf = MLPClassifier(solver='lbfgs', alpha=1e-5,
    # #                     hidden_layer_sizes=(300, 2))

    # from sklearn.ensemble import ExtraTreesClassifier
    # from sklearn.ensemble import RandomForestRegressor
    # from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import RandomForestRegressor
    # rf = ExtraTreesClassifier(n_estimators=500, class_weight='balanced', oob_score=True, bootstrap=True)
    # rf = RandomForestRegressor(n_estimators=500, oob_score=True, bootstrap=True)
    # rf = LassoCV(cv=10, normalize=False)
    #### fit machine learning models ####

    from sklearn.linear_model import LassoCV
    from sklearn.linear_model import Lasso
    from sklearn.linear_model import RidgeCV
    from sklearn.linear_model import Ridge

    from sklearn.linear_model import ElasticNetCV
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVR
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import MaxAbsScaler

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import RepeatedKFold, GroupKFold
    ################################################################

    X_ = X
    y_ = y

    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(2)
    X_p = np.hstack(((X + 1), 1 / (X + 1)))
    X_p = poly.fit_transform(X_p)
    X_p = np.sqrt(X_p)
    X_p.shape
    X_ = X_p

    plt.scatter(X_[:, 4], y_)

    rep_scores = []
    estimator = RidgeCV()
    estimator = RandomForestRegressor(n_estimators=50)
    cv = RepeatedKFold(n_splits=10, n_repeats=1)
    out_sample = {'pred': [], 'target': []}
    for train, test in cv.split(X_, y_):

        X_train = X_[train]
        X_test = X_[test]
        y_train = y_[train]
        y_test = y_[test]

        # X_train = np.sqrt(X_train + 1)
        # X_test = np.sqrt(X_test + 1)

        scale = StandardScaler()
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)

        # X_train = pca.fit_transform(X_train)
        # X_test = pca.transform(X_test)

        preds = estimator.fit(X_train, y_train).predict(X_test)
        # preds = dtr.undo(preds)
        # y_test = dtr.undo(y_test)

        rep_scores.append(metrics.rmse(preds, y_test))
        # rep_scores.append(estimator.score(X_test, y_test))

        out_sample['pred'].extend(preds)
        out_sample['target'].extend(y_test)

    print(np.mean(rep_scores), np.std(rep_scores))

    plt.scatter(preds, y_test)

    # dict elements from list to array
    for key, value in out_sample.items():
        out_sample[key] = np.array(value)

    metrics.rmse(out_sample['pred'], out_sample['target'])
    fig = plt.scatter(out_sample['pred'], out_sample['target'])

    np.sqrt(np.mean(out_sample['target']**2))

    y_test
    plt.hist(yt)

    ################################################################

    # # rf = AdaBoostClassifier(n_estimators=500)
    # # rf = LogisticRegression(penalty='l2')
    # estimator = LogisticRegression(max_iter=1000)
    # estimator = RandomForestClassifier(n_estimators=500, class_weight='balanced_subsample', oob_score=True)
    estimator = RandomForestRegressor(n_estimators=300,
                                      oob_score=True,
                                      bootstrap=True)

    learner = learning.TestLearner(task='regress')
    learner.test(estimator, X, y, folds=5, n_classes=2, rf_oob=True)

    print "proportion of 0:", sum(y == 0) / len(y)
    print "proportion of 1:", sum(y == 1) / len(y)

    estimator.fit(X, y)
    print(estimator.feature_importances_)

    # tmp = estimator.feature_importances_[::-1].reshape(6,5)

    def adjust_missing_feature_importances(importances, flag, n_outer,
                                           n_inner):
        rings = n_outer + n_inner
        if flag == 'n':
            return importances.reshape(6, rings)
        if flag == 'a':
            tmp = np.insert(importances, n_outer, n_inner * [0])
            tmp = np.insert(tmp, 0, n_inner * [0])
            return tmp.reshape(7, rings)

    # tmp = estimator.feature_importances_.reshape(6,5)

    from visualize_disc_importance import plot_discs, infer_sign_array

    signs = infer_sign_array(X, y)

    n_outer = np.sum(np.array(d_seq) > sample_diam)
    n_inner = len(d_seq) - n_outer
Example #11
    ratios = [5.0, 3.3, 2.5]
    for ratio in ratios:

        P_opt_file = open(
            subject + "/P-Pot-Average-Value-" + str(ratio) + ".csv", "w")
        Statistical_file = open(
            subject + "/STATISTICAL-" + str(ratio) + ".csv", "w")
        P_opt_betweenness_list = []
        P_opt_pagerank_list = []
        P_opt_degree_list = []
        P_opt_effort_list = []
        P_opt_effortcore_list = []

        kf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=0)
        for train_index, test_index in kf.split(all_data):
            try:
                data_train = all_data[train_index]
                data_test = all_data[test_index]
                label_train = all_label[train_index]
                label_test = all_label[test_index]

                test_class_name = []
                for each_index in test_index:
                    test_class_name.append(class_name_list[each_index])

                if (label_sum(label_train) > (len(label_train) / 2)):
                    print "The training data does not need balance."
                    predprob_auc, predprob, precision, recall, fmeasure, auc = classifier_output(
                        data_train,
                        label_train,
Example #12
oof_cb = np.array(pd.read_csv('cab_train.csv')['price'])

# Read price to evaluate the validation predictions
Train_data = pd.read_csv('train_tree.csv', sep=' ')
TestA_data = pd.read_csv('text_tree.csv', sep=' ')
Y_data = Train_data['price']

train_stack = np.vstack([oof_lgb, oof_cb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_cb]).transpose()
folds_stack = RepeatedKFold(n_splits=10, n_repeats=2, random_state=2018)
tree_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

# Second-layer Bayesian ridge stacking
for fold_, (trn_idx,
            val_idx) in enumerate(folds_stack.split(train_stack, Y_data)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], Y_data[trn_idx]
    val_data, val_y = train_stack[val_idx], Y_data[val_idx]

    Bayes = linear_model.BayesianRidge()
    Bayes.fit(trn_data, trn_y)
    tree_stack[val_idx] = Bayes.predict(val_data)
    predictions += Bayes.predict(test_stack) / 20  # 20 = n_splits * n_repeats

tree_predictions = np.expm1(predictions)
tree_stack = np.expm1(tree_stack)
tree_point = mean_absolute_error(tree_stack, np.expm1(Y_data))
print("树模型:二层贝叶斯: {:<8.8f}".format(tree_point))

# Load the neural network's predictions on the training data for a third-layer blend
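A hedged sketch of the third-layer blend the comment refers to (oof_nn and predictions_nn are hypothetical arrays holding the neural network's out-of-fold and test predictions; not the script's actual code):

# hypothetical: oof_nn / predictions_nn come from the neural network
stack3_train = np.vstack([tree_stack, oof_nn]).transpose()
stack3_test = np.vstack([tree_predictions, predictions_nn]).transpose()

blend = linear_model.BayesianRidge()
blend.fit(stack3_train, np.expm1(Y_data))
final_predictions = blend.predict(stack3_test)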
Example #13
import sys

from sklearn import svm
from sklearn.model_selection import RepeatedKFold


def main(argv=None):
    if (len(sys.argv) < 2):
        print_syntax()
    inputFile = None
    modelFile = None
    gamma = 0.1
    c = 1.0
    degree = 1
    kernel = "linear"
    param_index = 1
    while param_index < len(sys.argv):
        if (sys.argv[param_index] == "-i"):
            param_index = param_index + 1
            inputFile = sys.argv[param_index]
        elif (sys.argv[param_index] == "-o"):
            param_index = param_index + 1
            modelFile = sys.argv[param_index]
        elif (sys.argv[param_index] == "-m"):
            param_index = param_index + 1
            modelFile = sys.argv[param_index]
        elif (sys.argv[param_index] == "-g"):
            param_index = param_index + 1
            gamma = float(sys.argv[param_index])
        elif (sys.argv[param_index] == "-c"):
            param_index = param_index + 1
            c = float(sys.argv[param_index])
        elif (sys.argv[param_index] == "-d"):
            param_index = param_index + 1
            degree = int(sys.argv[param_index])
        elif (sys.argv[param_index] == "-t"):
            param_index = param_index + 1
            if sys.argv[param_index] == "0":
                kernel = "linear"
            elif sys.argv[param_index] == "1":
                kernel = "poly"
            elif sys.argv[param_index] == "2":
                kernel = "rbf"
            else:
                print_syntax()
        else:
            print("Unknown parameter: ", sys.argv[param_index])
            print_syntax()
        param_index = param_index + 1
    lines = None
    with open(inputFile) as f:
        lines = [line.rstrip() for line in f]
    split_lines = [None] * len(lines)
    max_index = 0
    valid_lines = 0
    for i in range(0, len(lines)):
        fields = lines[i].split(" ")
        split_lines[i] = fields
        if (len(fields) > 1):
            current_maxindex = int(
                fields[len(fields) - 1][:fields[len(fields) - 1].find(":")])
            if (current_maxindex > max_index):
                max_index = current_maxindex
            valid_lines = valid_lines + 1

    matrix = []
    label = []
    for fields in split_lines:
        if (len(fields) > 1):
            data = [0.0 for x in range(max_index)]
            label.append(int(fields[0]))
            for i in range(1, len(fields)):
                data[int(fields[i][:fields[i].find(":")]) - 1] = float(
                    fields[i][(fields[i].find(":")) + 1:])
            matrix.append(data)

    model = svm.SVC(kernel=kernel, gamma=gamma, C=c)

    random_state = 12883824
    rkf = RepeatedKFold(n_splits=len(matrix),
                        n_repeats=1,
                        random_state=random_state)

    pred = [0 for x in range(len(matrix))]

    for train_index, test_index in rkf.split(matrix):
        X_train = [[0 for x in range(max_index)]
                   for y in range(len(matrix) - 1)]
        X_test = [[0 for x in range(max_index)] for y in range(1)]
        label_train = [0 for x in range(len(matrix) - 1)]
        label_test = [0 for x in range(1)]

        y = 0
        for i in train_index:
            for x in range(max_index):
                X_train[y][x] = matrix[i][x]
            label_train[y] = label[i]
            y = y + 1
        y = 0
        for i in test_index:
            for x in range(max_index):
                X_test[y][x] = matrix[i][x]
            label_test[y] = label[i]
            y = y + 1
        model.fit(X_train, label_train)
        res = model.predict(X_test)
        pred[test_index[0]] = res[0]

    relevant = 0
    for i in range(len(label)):
        if (label[i] == 1):
            relevant = relevant + 1
    relevant_and_retrieved = 0
    retrieved = 0
    for i in range(len(pred)):
        if (pred[i] == 1):
            retrieved = retrieved + 1
        if ((pred[i] == 1) and (label[i] == 1)):
            relevant_and_retrieved = relevant_and_retrieved + 1
    recall = relevant_and_retrieved / relevant
    precision = 0
    if (retrieved > 0):
        precision = relevant_and_retrieved / retrieved

    print("Mean-Recall " + repr(recall))
    print("Mean-Precision " + repr(precision))
Example #14
resultados = []
# list of results from which the accuracy will be computed
X_treino, X_val, y_treino, y_val = train_test_split(X, y, test_size=0.6)
# train_test_split divides our data in a standardized way;
# here 60% goes to the test set and 40% to training


KFold = RepeatedKFold(n_splits=2, n_repeats=10, random_state=10)
# builds the train/validation (or test) splits used to estimate the
# model's accuracy, and repeats the splitting process

# loop that randomly tells us which rows to use for training and validation
for linhas_treino, linhas_val in KFold.split(X):
    
    print("Treino:", linhas_treino[0])
    print("Valid:", linhas_val.shape[0])
    print()
    
    X_treino = X.iloc[linhas_treino]
    X_val = X.iloc[linhas_val]
    y_treino = y.iloc[linhas_treino]
    y_val = y.iloc[linhas_val]
    
    print(X_treino.head())
    print()
    
    Floresta.fit(X_treino, y_treino)
# .fit is the function used to train the model so it can produce predictions
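The same accuracy estimate can be collected in one call; a sketch assuming the Floresta model and the X, y used above (note that KFold here is the RepeatedKFold instance, not the class):

from sklearn.model_selection import cross_val_score

scores = cross_val_score(Floresta, X, y, cv=KFold, scoring='accuracy')
print(scores.mean(), scores.std())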
Example #15
# Set random state here
random_state = 0
# Train test split, save 20% of data point to the test set
X_train, X_test, y_train, y_test, X_before_train, X_before_test = train_test_split(X, y, X_before_scaling, test_size=0.2, random_state = random_state)
                    
                    
# The alpha grid used for plotting path
alphas_grid = np.logspace(0, -3, 20)

# Cross-validation scheme                                  
rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=random_state)

# Explicitly take out the train/test set
X_cv_train, y_cv_train, X_cv_test, y_cv_test = [],[],[],[]

for train_index, test_index in rkf.split(X_train):
    X_cv_train.append(X_train[train_index])
    y_cv_train.append(y_train[train_index])
    X_cv_test.append(X_train[test_index])
    y_cv_test.append(y_train[test_index])
    

# %% [markdown]
# ### Step 5 - Train ML models
# %% [markdown]
# #### LASSO Regression<a name="lasso"></a>
# 

# %%
#%% LASSO regression
'''   
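The cell is cut off at the opening docstring above. A minimal sketch of what a LASSO path over alphas_grid could look like with the CV lists built in Step 4 (an illustration, not the notebook's actual code):

import numpy as np
from sklearn.linear_model import Lasso

cv_mse = []
for alpha in alphas_grid:
    fold_mse = []
    for X_tr, y_tr, X_te, y_te in zip(X_cv_train, y_cv_train,
                                      X_cv_test, y_cv_test):
        model = Lasso(alpha=alpha, max_iter=10000).fit(X_tr, y_tr)
        fold_mse.append(np.mean((model.predict(X_te) - y_te) ** 2))
    cv_mse.append(np.mean(fold_mse))

best_alpha = alphas_grid[int(np.argmin(cv_mse))]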
Example #16
    def cv_baggingDT(self,
                     pu_data,
                     splits=3,
                     repeats=100,
                     bags=100,
                     filename=''):
        """
        Train bagged decision tree base classifiers and do repeated 
        k-fold CV.

        Synthesizability scores (0 = not synthesizable, 1 = already
        synthesized) are generated for an unlabeled sample by averaging
        the scores from the ensemble of decision tree classifiers that
        have not been trained on that sample. 

        Args:
            pu_data (json): A file of numeric features describing materials. There MUST be a column called "PU_label" where a 1 value indicates a synthesized (positive) compound and a 0 value indicates an unlabeled compound.

            splits (int): Number of splits in k-fold CV.
            repeats (int): Number of repeated k-fold CV.
            bags (int): Number of bags in bootstrap aggregation.
            filename (string): Save model training results to file with
                filename ending in .json or .pkl.

        Returns:
            pu_stats (dict): Metrics and outputs of PU learning model
                training.

        """
        print('Start PU Learning.')

        # Preprocess data and set attributes
        df = pd.read_json(pu_data)
        df_P, df_U, X_P, X_U = self._process_pu_data(df)
        self.df_P = df_P
        self.df_U = df_U

        # Split data into training and test splits for k-fold CV
        kfold = RepeatedKFold(n_splits=splits,
                              n_repeats=repeats,
                              random_state=42)

        # Scores for PU learning (tpr = True Positive Rate)
        scores = []
        tprs = []

        # Predicted synthesis probability of CVed P and U sets
        prob_P = np.ones(shape=(X_P.shape[0], splits * repeats))
        prob_U = -np.ones(shape=(X_U.shape[0], splits * repeats))

        # Feature importance
        feat_rank = np.zeros(shape=(X_P.shape[1], splits * repeats))

        idsp = 0  # index of repeated k splits

        # Loop over P and U training/test samples
        for (ptrain, ptest), (utrain, utest) in zip(kfold.split(X_P),
                                                    kfold.split(X_U)):

            # Number of P and U training samples
            N_ptrain = X_P[ptrain].shape[0]
            N_utrain = X_U[utrain].shape[0]

            d = X_P.shape[1]
            K = N_ptrain
            train_label = np.zeros(shape=(N_ptrain + K, ))
            train_label[:N_ptrain] = 1.0  # Synthesized (positive)

            # Out of bag samples
            n_oob = np.zeros(shape=(N_utrain, ))
            f_oob = np.zeros(shape=(N_utrain, 2))

            # Sums of probabilities of test sets
            f_ptest = np.zeros(shape=(X_P[ptest].shape[0], 2))
            f_utest = np.zeros(shape=(X_U[utest].shape[0], 2))

            # Bootstrap resampling for each bag
            for i in range(bags):
                bootstrap_sample = np.random.choice(np.arange(N_utrain),
                                                    replace=True,
                                                    size=K)

                # Positive samples and bootstrapped unlabeled samples
                data_bootstrap = np.concatenate(
                    (X_P[ptrain], X_U[bootstrap_sample, :]), axis=0)

                # Train decision tree classifier
                model = DecisionTreeClassifier(max_depth=None,
                                               max_features=None,
                                               criterion='gini',
                                               class_weight='balanced')

                model.fit(data_bootstrap, train_label)

                # Index for the oob samples
                idx_oob = sorted(
                    set(range(N_utrain)) - set(np.unique(bootstrap_sample)))

                # Transductive learning on oob samples
                f_oob[idx_oob] += model.predict_proba(X_U[utrain][idx_oob])
                n_oob[idx_oob] += 1
                f_ptest += model.predict_proba(X_P[ptest])
                f_utest += model.predict_proba(X_U[utest])
                feat_rank[:, idsp] = model.feature_importances_

            # Predicted synthesis probabilities of unlabeled samples
            predict_utrain = f_oob[:, 1] / n_oob

            # Predicted probabilities for P and U test sets
            predict_ptest = f_ptest[:, 1] / bags
            predict_utest = f_utest[:, 1] / bags

            # Find predicted positives
            true_pos = predict_ptest[np.where(predict_ptest > 0.5)].shape[0]
            u_pos = predict_utest[np.where(predict_utest > 0.5)].shape[0]

            N_ptest = X_P[ptest].shape[0]
            N_utest = X_U[utest].shape[0]

            # Predicted positive ratio in test set
            p_pred_pos = (true_pos + u_pos) / (N_ptest + N_utest) + 0.0001

            # Compute PU recall (TPR) and score metrics
            recall = true_pos / N_ptest
            score = recall**2 / p_pred_pos
            scores.append(score)
            tprs.append(recall)

            # Predicted probabilities
            prob_P[ptest, idsp] = predict_ptest
            prob_U[utrain, idsp] = predict_utrain
            prob_U[utest, idsp] = predict_utest
            idsp += 1

            # Progress update
            if (idsp + 1) % splits == 0:
                tpr_tmp = np.asarray(tprs[-splits - 1:-1])
                print("Performed Repeated " + str(splits) + "-fold: " +
                      str(idsp // splits + 1) + " out of " + str(repeats))
                print("True Positive Rate: %0.2f (+/- %0.2f)" %
                      (tpr_tmp.mean(), tpr_tmp.std() * 2))

        # Predicted labels from k-fold CV
        label_U = np.zeros(shape=(X_U.shape[0], splits * repeats + 1),
                           dtype=int)
        label_U[:, :splits * repeats][np.where(prob_U > 0.5)] = 1
        label_U[:,
                splits * repeats] = np.sum(label_U[:, :splits * repeats + 1],
                                           axis=1)

        tprs = np.asarray(tprs)
        scores = np.asarray(scores)

        # Metrics for each model in the k-folds
        label_U_rp = np.zeros(shape=(X_U.shape[0], repeats), dtype=int)
        prob_U_rp = np.zeros(shape=(X_U.shape[0], repeats))
        feat_rank_rp = np.zeros(shape=(X_U.shape[1], repeats))
        tpr_rp = np.zeros(shape=(repeats, ))
        scores_rp = np.zeros(shape=(repeats, ))
        labels = np.zeros(shape=(X_U.shape[0], ))

        for i in range(repeats):
            prob_U_rp[:, i] = prob_U[:,
                                     i * splits:(i + 1) * splits].mean(axis=1)
            feat_rank_rp[:, i] = feat_rank[:, i * splits:(i + 1) *
                                           splits].mean(axis=1)
            tpr_rp[i] = tprs[i * splits:(i + 1) * splits].mean()
            scores_rp[i] = scores[i * splits:(i + 1) * splits].mean()

        label_U_rp[np.where(prob_U_rp > 0.5)] = 1
        prob = prob_U_rp.mean(axis=1)
        labels[np.where(prob > 0.5)] = 1

        # Get confidence interval of TPR for each kfold
        tpr_low, tpr_up = self.bootstrapCI(tpr_rp)
        scores_low, scores_up = self.bootstrapCI(scores_rp)

        # PU learning metrics
        metrics = np.asarray([
            tpr_rp.mean(), tpr_low, tpr_up,
            scores_rp.mean(), scores_low, scores_up
        ])

        print("Accuracy: %0.2f" % (tpr_rp.mean()))
        print("95%% confidence interval: [%0.2f, %0.2f]" % (tpr_low, tpr_up))

        # Metrics and results from training / testing
        pu_stats = {
            'prob': prob,
            'labels': labels,
            'metrics': metrics,
            'prob_rp': prob_U_rp,
            'label_rp': label_U_rp,
            'tpr_rp': tpr_rp,
            'scores_rp': scores_rp,
            'feat_rank_rp': feat_rank_rp
        }

        # Save results
        if filename:
            if filename.endswith(".json"):
                dumpfn(pu_stats, filename)
            if filename.endswith(".pkl"):
                with open(filename, 'wb') as file:
                    pickle.dump(pu_stats,
                                file,
                                protocol=pickle.HIGHEST_PROTOCOL)

        self.pu_stats = pu_stats
        return pu_stats
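bootstrapCI is called above but not included in this snippet; a plausible percentile-bootstrap sketch of it (an assumption about its behavior, not the project's actual code):

    def bootstrapCI(self, data, n_boot=1000, ci=95):
        """Percentile bootstrap confidence interval for the mean (sketch)."""
        import numpy as np
        means = [np.random.choice(data, size=len(data), replace=True).mean()
                 for _ in range(n_boot)]
        lower = np.percentile(means, (100 - ci) / 2)
        upper = np.percentile(means, 100 - (100 - ci) / 2)
        return lower, upper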
Example #17
def pls_train(groups, varname='valence', arrayname='norm', scale=True,
              ncomps=2, cv_folds=None, cv_repeats=None, skip_cv=False,
              xmin=-np.inf, xmax=np.inf, _larch=None, **kws):

    """use a list of data groups to train a Partial Least Squares model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      scale       bool to scale data [True]
      cv_folds    None or number of Cross-Validation folds (see Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (see Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]
      ncomps      number of independent components  (See Note 5) [2]

    Returns
    -------
      group with trained PLSRegression, to be used with pls_predict

    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  all groups must have an attribute (scalar value) for `varname`
     3.  arrayname can be one of `norm` or `dmude`
     4.  Cross-Validation:  if cv_folds is None, sqrt(len(groups)) will be used
            (rounded to integer).  if cv_repeats is None, sqrt(len(groups))-1
            will be used (rounded).
     5.  The optimal number of components may be best found from PCA. If set to None,
         a search will be done for ncomps that gives the lowest RMSE_CV.
    """
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            raise Value("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)

    nvals = len(groups)

    kws['scale'] = scale
    kws['n_components'] = ncomps

    model = PLSRegression(**kws)

    rmse_cv = None
    if not skip_cv:
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if  cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        resid = []
        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])[:, 0]
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt( (resid**2).mean() )

    # final fit without cross-validation
    model = PLSRegression(**kws)
    out = model.fit(spectra, ydat)

    ypred = model.predict(spectra)[:, 0]

    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 coefs=model.x_weights_, loadings=model.x_loadings_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats, rmse_cv=rmse_cv,
                 rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, scale=scale, groupnames=groupnames,
                 keywords=kws)
Example #18
def train():

    # net = Net()
    print(net)

    # 10-fold cross-validation, repeated 10 times
    kf = RepeatedKFold(n_splits=10,
                       n_repeats=10,
                       random_state=int(time.time()))

    # data
    data = np.genfromtxt('5.csv', delimiter=',')
    X = data[:, :-1]
    Y = data[:, -1]

    optimizer = torch.optim.Adam(net.parameters(), lr=0.00001)
    loss_func = nn.CrossEntropyLoss()

    validate_loss_final = 0.0

    for train_index, test_index in kf.split(X):

        X_train = X[train_index]
        X_validate = X[test_index]
        Y_train = Y[train_index]
        Y_validate = Y[test_index]

        train_dataset = InsuranceDataSet(X_train, Y_train)
        train_loader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=BATCH_SIZE)

        for epoch in range(30):

            # train
            net.train()
            for i, train_data in enumerate(train_loader, 0):
                features, label = train_data
                features = Variable(features)
                label = Variable(label)

                prediction = net(features)

                # print('output size is {}'.format(prediction.size()))

                loss = loss_func(prediction, label)

                # backpropagation and optimizer step
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # validation
            net.eval()

            validate_features = Variable(
                torch.from_numpy(X_validate).type(torch.FloatTensor))
            validate_predictions = net(validate_features).detach().numpy()

            result = validate_predictions[:, 0] - validate_predictions[:, 1]
            result_bool = result < 0
            result_bool = result_bool.astype('int')

            validate_loss = f1_score(Y_validate, result_bool)

            print('f1 is {}'.format(validate_loss))
            if epoch == 0 or validate_loss > validate_loss_final:
                torch.save(net.state_dict(), 'Net-round-{}.pth'.format(epoch))
                validate_loss_final = validate_loss

        print('Finish training...')
        print('best is {}'.format(validate_loss_final))
        torch.save(net.state_dict(), 'Net.pth')
Example #19
import numpy as np
from sklearn.model_selection import (KFold, RepeatedKFold, ShuffleSplit,
                                     cross_val_score, train_test_split)

X = ["a", "b", "c", "d", "d", "e", "h", "er", "erer", "342"]
kf = RepeatedKFold(n_splits=4, n_repeats=3)

for train, test in kf.split(X):
    print("%s %s" % (train, test))

y = np.random.random(10)
# stratify requires discrete class labels, so it is omitted for continuous y
train_test_split(X, y, test_size=0.3, shuffle=True)
Example #20
               epsilon=.1,
               coef0=1)

# #############################################################################
# Look at the results
lw = 2

svrs = [svr_rbf, svr_lin, svr_poly]
kernel_label = ['RBF', 'Linear', 'Polynomial']
model_color = ['m', 'c', 'g']

# 5-fold split of the training data
kf = RepeatedKFold(n_splits=5)
x_np_array, y_np_array = X.to_numpy(), y.to_numpy()
index = np.arange(0, SAMPLE_LENGTH)
for train_index, test_index in kf.split(X):
    train_x, train_y = x_np_array[train_index], y_np_array[train_index]
    test_x, test_y = x_np_array[test_index], y_np_array[test_index]
    clf = svr_lin.fit(train_x, train_y)

    mae_in_train = mean_absolute_error(svr_lin.predict(train_x), train_y)
    mae_in_test = mean_absolute_error(svr_lin.predict(test_x), test_y)
    r2_score_in_train = r2_score(train_y, svr_lin.predict(train_x))
    # r2_score_in_test = r2_score(test_y, svr_lin.predict(test_x))
    print(mae_in_train, mae_in_test)
    plt.plot(index, y, label="Real surface roughness")
    plt.scatter(train_index,
                svr_lin.predict(train_x),
                facecolor="none",
                edgecolor="k",
                label="SF in train dataset")
Example #21
    def fit(self,
            X,
            y,
            labels=None,
            dist=None,
            importance_weights=None,
            cv_indices=None,
            dist_savename=None):
        t = time.time()

        if y.ndim < 2:
            y = y.reshape(-1, 1)

        if self.n_components is not None:
            if self.verbose > 0:
                elapsed = time.time() - t
                print('PCA [%dmin %dsec]' %
                      (int(elapsed / 60), int(elapsed % 60)))
            sys.stdout.flush()
            self.pca = PCA(n_components=self.n_components, svd_solver='arpack')
            y_ = self.pca.fit_transform(y)
            if self.verbose > 0:
                print('Lost %.1f%% information ' % (self.pca.noise_variance_) +
                      '[%dmin %dsec]' % (int(elapsed / 60), int(elapsed % 60)))
                elapsed = time.time() - t
        else:
            y_ = y

        if labels is not None:
            raise RuntimeError('Not implemented.')

        if cv_indices is None:
            cv_indices = np.arange(X.shape[0])
        if self.cv_type is None:
            kfold = RepeatedKFold(n_splits=self.cv_nfolds,
                                  n_repeats=self.cv_shuffles)
            cv_folds = kfold.split(X[cv_indices])
            n_cv_folds = kfold.get_n_splits()
        elif self.cv_type == 'iter':
            cv_folds = self.cv_groups
            n_cv_folds = len(self.cv_groups)
        elif self.cv_type == 'group':
            groups = self.cv_groups
            if self.cv_nfolds is None:
                self.cv_nfolds = len(np.unique(groups))
            kfold = GroupKFold(n_splits=self.cv_nfolds)
            cv_folds = kfold.split(X[cv_indices], y[cv_indices], groups)
            n_cv_folds = kfold.get_n_splits()
        else:
            raise Exception('Cross-validation type not supported')

        add_train_inds = np.setdiff1d(np.arange(X.shape[0]), cv_indices)
        cv_folds = list(cv_folds)
        cv_folds = [(np.concatenate((train_fold, add_train_inds)), test_fold)
                    for train_fold, test_fold in cv_folds]

        if self.verbose > 0:
            elapsed = time.time() - t
            print('Computing distance matrix [%dmin %dsec]' %
                  (int(elapsed / 60), int(elapsed % 60)))
            sys.stdout.flush()

        if dist is None:
            dist = euclidean_distances(X, None, squared=self.squared_dist)
            if dist_savename is not None:
                if self.verbose > 0:
                    print('Saving distance matrix to file:', dist_savename)
                np.save(dist_savename, dist)

        if importance_weights is None:
            self.krr_param_grid['lambda'] = [0]
            importance_weights = np.ones((X.shape[0], ))

        importance_weights = importance_weights**(0.5)

        errors = []
        if 'v' in self.krr_param_grid:
            for fold_i, (train_i, test_i) in enumerate(cv_folds):
                fold_errors = np.empty(
                    (len(self.krr_param_grid['v']),
                     len(self.krr_param_grid['gamma']), 1,
                     len(self.krr_param_grid['alpha']), y_.shape[1]))
                if self.verbose > 0:
                    elapsed = time.time() - t
                    print('CV %d of %d [%dmin %dsec]' %
                          (fold_i + 1, n_cv_folds, int(
                              elapsed / 60), int(elapsed % 60)))
                    sys.stdout.flush()
                for v_i, v in enumerate(self.krr_param_grid['v']):
                    for gamma_i, gamma in enumerate(
                            self.krr_param_grid['gamma']):
                        for lamb_i, lamb in enumerate(
                                self.krr_param_grid['lambda']):
                            iw = importance_weights**lamb
                            iw = iw[:, None]
                            K_train = self.kernel.apply_to_dist(dist[np.ix_(
                                train_i, train_i)],
                                                                gamma=gamma)
                            K_train *= np.outer(iw[train_i], iw[train_i])
                            K_test = self.kernel.apply_to_dist(dist[np.ix_(
                                test_i, train_i)],
                                                               gamma=gamma)
                            if self.verbose > 0:
                                sys.stdout.write('.')
                                sys.stdout.flush()
                            for alpha_i, alpha in enumerate(
                                    self.krr_param_grid['alpha']):
                                if self.verbose > 0:
                                    sys.stdout.write(',')
                                    sys.stdout.flush()
                                for y_i in np.arange(y_.shape[1]):
                                    K_train_ = K_train.copy()
                                    alpha_add = get_alpha_add(
                                        self.n_basis, self.n_grid, self.delta,
                                        v)
                                    K_train_.flat[::K_train_.shape[0] +
                                                  1] += alpha * alpha_add[y_i]
                                    try:
                                        L_ = cholesky(K_train_, lower=True)
                                        x = solve_triangular(L_,
                                                             y_[train_i, y_i],
                                                             lower=True)
                                        dual_coef_ = solve_triangular(L_.T, x)
                                        pred_mean = np.dot(K_test, dual_coef_)
                                        if self.mae:
                                            e = np.mean(
                                                np.abs(pred_mean -
                                                       y_[test_i, y_i]), 0)
                                        else:
                                            e = np.mean((pred_mean -
                                                         y_[test_i, y_i])**2,
                                                        0)
                                    except np.linalg.LinAlgError:
                                        e = np.inf
                                    fold_errors[v_i, gamma_i, 0, alpha_i,
                                                y_i] = e
                if self.verbose > 0:
                    sys.stdout.write('\n')
                    sys.stdout.flush()
                errors.append(fold_errors)
            errors = np.array(errors)
            errors = np.mean(errors, 0)  # average over folds
        else:
            for fold_i, (train_i, test_i) in enumerate(cv_folds):
                fold_errors = np.empty(
                    (len(self.krr_param_grid['gamma']),
                     len(self.krr_param_grid['lambda']),
                     len(self.krr_param_grid['alpha']), y_.shape[1]))
                if self.verbose > 0:
                    elapsed = time.time() - t
                    print('CV %d of %d [%dmin %dsec]' %
                          (fold_i + 1, n_cv_folds, int(
                              elapsed / 60), int(elapsed % 60)))
                    sys.stdout.flush()
                for gamma_i, gamma in enumerate(self.krr_param_grid['gamma']):
                    if self.verbose > 0:
                        sys.stdout.write('.')
                        sys.stdout.flush()
                    for lamb_i, lamb in enumerate(
                            self.krr_param_grid['lambda']):
                        iw = importance_weights**lamb
                        iw = iw[:, None]
                        K_train = self.kernel.apply_to_dist(dist[np.ix_(
                            train_i, train_i)],
                                                            gamma=gamma)
                        K_train *= np.outer(iw[train_i], iw[train_i])
                        K_test = self.kernel.apply_to_dist(dist[np.ix_(
                            test_i, train_i)],
                                                           gamma=gamma)
                        for alpha_i, alpha in enumerate(
                                self.krr_param_grid['alpha']):
                            if self.verbose > 0:
                                sys.stdout.write(',')
                                sys.stdout.flush()
                            K_train_ = K_train.copy()
                            K_train_.flat[::K_train_.shape[0] + 1] += alpha
                            try:
                                L_ = cholesky(K_train_, lower=True)
                                x = solve_triangular(L_,
                                                     iw[train_i] * y_[train_i],
                                                     lower=True)
                                dual_coef_ = iw[train_i] * solve_triangular(
                                    L_.T, x)
                                pred_mean = np.dot(K_test, dual_coef_)
                                if self.mae:
                                    e = np.mean(
                                        np.abs(pred_mean - y_[test_i]) *
                                        importance_weights[test_i, None]**2, 0)
                                else:
                                    e = np.mean(
                                        ((pred_mean - y_[test_i])**2) *
                                        importance_weights[test_i, None]**2, 0)
                            except np.linalg.LinAlgError:
                                e = np.inf
                            fold_errors[gamma_i, lamb_i, alpha_i] = e
                if self.verbose > 0:
                    sys.stdout.write('\n')
                    sys.stdout.flush()
                errors.append(fold_errors)
            errors = np.array(errors)
            errors = np.mean(errors, 0)  # average over folds

        self.dual_coefs_ = np.empty((y_.shape[1], X.shape[0]))
        self.alphas_ = np.empty(y_.shape[1])
        self.lambdas_ = np.empty(y_.shape[1])
        self.gammas_ = np.empty(y_.shape[1])
        if self.verbose > 0:
            elapsed = time.time() - t
            print('Refit [%dmin %dsec]' %
                  (int(elapsed / 60), int(elapsed % 60)))
            sys.stdout.flush()
        print_count = 0

        if not self.single_combo:
            for i in range(y_.shape[1]):
                min_params = np.argsort(errors[:, :, :, i], axis=None)
                # lin_alg_errors = 0
                gamma_i, lamb_i, alpha_i = np.unravel_index(
                    min_params[0], errors.shape[:3])
                gamma = self.krr_param_grid['gamma'][gamma_i]
                lamb = self.krr_param_grid['lambda'][lamb_i]
                alpha = self.krr_param_grid['alpha'][alpha_i]
                self.alphas_[i] = alpha
                self.gammas_[i] = gamma
                self.lambdas_[i] = lamb

                if (gamma_i in (0, len(self.krr_param_grid['gamma']) - 1) or
                        lamb_i in (0, len(self.krr_param_grid['lambda']) - 1)
                        or alpha_i
                        in (0, len(self.krr_param_grid['alpha']) - 1)):
                    if print_count <= 200:
                        fmtstr = '%d: gamma=%g\talpha=%g\tlambda=%g\terror=%g\tmean=%g'
                        print(fmtstr % (i, gamma, alpha, lamb,
                                        errors[gamma_i, lamb_i, alpha_i, i],
                                        errors[gamma_i, lamb_i, alpha_i, i] /
                                        np.mean(np.abs(y_[:, i]))))
                        print_count += 1
        else:
            errors = np.mean(errors, -1)  # average over outputs
            if self.verbose > 1:
                print('CV errors:')
                print(errors)
                print('Alpha params:')
                print(self.krr_param_grid['alpha'])
                print('Gamma params:')
                print(self.krr_param_grid['gamma'])
                print('Lambda params:')
                print(self.krr_param_grid['lambda'])
            if self.verbose > 0:
                print('Min error: ', np.min(errors))

            # print np.log(errors)
            # plt.imshow(np.log(errors))
            # plt.xticks(range(10), map('{:.1e}'.format, list(self.krr_param_grid['alpha'])))
            # plt.yticks(range(10), map('{:.1e}'.format, list(self.krr_param_grid['gamma'])))
            # plt.xlabel('alpha')
            # plt.ylabel('gamma')
            # plt.colorbar()
            # plt.show()
            min_params = np.argsort(errors, axis=None)
            if 'v' in self.krr_param_grid:
                v_i, gamma_i, lamb_i, alpha_i = np.unravel_index(
                    min_params[0], errors.shape)
                v = self.krr_param_grid['v'][v_i]
                print('v=', v)
            else:
                gamma_i, lamb_i, alpha_i = np.unravel_index(
                    min_params[0], errors.shape)
            gamma = self.krr_param_grid['gamma'][gamma_i]
            alpha = self.krr_param_grid['alpha'][alpha_i]
            lamb = self.krr_param_grid['lambda'][lamb_i]

            if 'v' in self.krr_param_grid:
                if v == self.krr_param_grid['v'][0]:
                    print('v at lower edge.')
                if v == self.krr_param_grid['v'][-1]:
                    print('v at upper edge.')
            if len(self.krr_param_grid['gamma']) > 1:
                if gamma == self.krr_param_grid['gamma'][0]:
                    print('Gamma at lower edge.')
                if gamma == self.krr_param_grid['gamma'][-1]:
                    print('Gamma at upper edge.')
            if len(self.krr_param_grid['alpha']) > 1:
                if alpha == self.krr_param_grid['alpha'][0]:
                    print('Alpha at lower edge.')
                if alpha == self.krr_param_grid['alpha'][-1]:
                    print('Alpha at upper edge.')
            if len(self.krr_param_grid['lambda']) > 1:
                if lamb == self.krr_param_grid['lambda'][0]:
                    print('Lambda at lower edge.')
                if lamb == self.krr_param_grid['lambda'][-1]:
                    print('Lambda at upper edge.')
            self.alphas_[:] = alpha
            self.gammas_[:] = gamma
            self.lambdas_[:] = lamb

            if 'v' in self.krr_param_grid:
                alpha_add = get_alpha_add(self.n_basis, self.n_grid,
                                          self.delta, v)
                self.alphas_ *= alpha_add

        combos = list(zip(self.alphas_, self.gammas_, self.lambdas_))
        n_unique_combos = len(set(combos))
        self.L_fit_ = [None] * n_unique_combos
        for i, (alpha, gamma, lamb) in enumerate(set(combos)):
            if self.verbose > 0:
                elapsed = time.time() - t
                print('Parameter combinations ' + '%d of %d [%dmin %dsec]' %
                      (i + 1, n_unique_combos, int(elapsed / 60),
                       int(elapsed % 60)))
                sys.stdout.flush()
            y_list = [
                i for i in range(y_.shape[1]) if self.alphas_[i] == alpha
                and self.gammas_[i] == gamma and self.lambdas_[i] == lamb
            ]

            iw = importance_weights**lamb
            iw = iw[:, None]
            K = self.kernel.apply_to_dist(dist, gamma=gamma)
            K *= np.outer(iw, iw)
            # Escalate the ridge term until Cholesky succeeds: the first pass
            # adds 0.9 * alpha to the diagonal, and alpha grows 10x after each
            # failure, so the accumulated jitter approaches the current alpha.
            while True:
                K.flat[::K.shape[0] + 1] += alpha - (alpha / 10)
                try:
                    if self.verbose > 0:
                        print('trying cholesky decomposition, alpha', alpha)
                    L_ = cholesky(K, lower=True)
                    self.L_fit_[i] = L_
                    x = solve_triangular(L_, iw * y_[:, y_list], lower=True)
                    # x = solve_triangular(L_, y_[:, y_list], lower=True)
                    dual_coef_ = solve_triangular(L_.T, x)
                    self.dual_coefs_[y_list] = iw.T * dual_coef_.T.copy()
                    break
                except np.linalg.LinAlgError:
                    if self.verbose > 0:
                        print('LinAlgError, increasing alpha')
                    alpha *= 10
                    self.alphas_[0] = alpha  # note: only the first entry is updated

        if self.copy_X:
            self.X_fit_ = X.copy()
            self.y_fit_ = y.copy()
        else:
            self.X_fit_ = X
            self.y_fit_ = y
        self.errors = errors

        if self.verbose > 0:
            elapsed = time.time() - t
            print('Done [%dmin %dsec]' %
                  (int(elapsed / 60), int(elapsed % 60)))
            sys.stdout.flush()
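# The escalating-jitter Cholesky solve above is easy to lift out on its own.
# A minimal, self-contained sketch (synthetic data; the starting alpha and the
# 10x escalation factor mirror the fragment above):
import numpy as np
from scipy.linalg import cholesky, solve_triangular

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))
y = rng.normal(size=50)

# RBF kernel matrix from pairwise squared distances
sq_dists = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
K = np.exp(-0.5 * sq_dists)

alpha = 1e-10
while True:
    # add (alpha - alpha/10) to the diagonal; cumulative jitter ~= alpha
    K.flat[::K.shape[0] + 1] += alpha - alpha / 10
    try:
        L = cholesky(K, lower=True)
        dual_coef = solve_triangular(L.T, solve_triangular(L, y, lower=True))
        break
    except np.linalg.LinAlgError:
        alpha *= 10  # not positive definite yet: escalate and retry

pred = K @ dual_coef  # in-sample predictions with the regularized kernel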
Example #22
    y = train[target_col].values
    id_train = train[id_col].values

    X_test = test.drop(cols_to_drop, axis=1, errors='ignore')
    id_test = test[id_col].values

    feature_names = list(X.columns)
    n_features = X.shape[1]
    dprint(f'n_features: {n_features}')

    p_test = []
    dfs_train = []
    dfs_test = []

    for fold_i_oof, (train_index_oof,
                     valid_index_oof) in enumerate(kf1.split(X, y)):
        x_train_oof = X.iloc[train_index_oof]
        x_valid_oof = X.iloc[valid_index_oof]

        y_train_oof = y[train_index_oof]
        y_valid_oof = y[valid_index_oof]

        id_train_oof = id_train[valid_index_oof]

        for fold_i, (train_index, valid_index) in enumerate(
                kf2.split(x_train_oof, y_train_oof)):
            params = lgb_params.copy()

            x_train = x_train_oof.iloc[train_index]
            x_valid = x_train_oof.iloc[valid_index]
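# The fragment above nests two splitters: kf1 yields outer out-of-fold
# partitions and kf2 re-splits each outer training set. A compact sketch of
# that nesting with plain KFold (kf1/kf2 echo the fragment; no model is fit):
import numpy as np
from sklearn.model_selection import KFold

X = np.random.rand(100, 5)
y = np.random.rand(100)

kf1 = KFold(n_splits=5, shuffle=True, random_state=0)
kf2 = KFold(n_splits=4, shuffle=True, random_state=1)

for fold_i_oof, (train_index_oof, valid_index_oof) in enumerate(kf1.split(X, y)):
    x_train_oof, y_train_oof = X[train_index_oof], y[train_index_oof]
    # inner models see only the outer training rows, so any prediction made
    # on valid_index_oof stays strictly out-of-fold
    for fold_i, (train_index, valid_index) in enumerate(
            kf2.split(x_train_oof, y_train_oof)):
        x_train, x_valid = x_train_oof[train_index], x_train_oof[valid_index]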
Example #23
    markers = [x for x in df.columns if 'aal' in x]
    X = df[markers].values
    # np.int was removed in NumPy 1.24; the builtin int is equivalent here
    y = 1 - (df['Final diagnosis (behav)'] == 'VS').values.astype(int)

    results = dict()
    results['Iteration'] = []
    results['Weight Val'] = []
    results['Classifier'] = []
    results['AUC'] = []
    results['Precision'] = []
    results['Recall'] = []

    sss = RepeatedKFold(n_splits=5, n_repeats=50, random_state=42)

    for t_iter, (train, test) in enumerate(sss.split(X, y)):
        for val in weight_val:
            classifiers['SVC_fs10'] = Pipeline([
                    ('scaler', RobustScaler()),
                    ('select', SelectKBest(f_classif, k=10)),
                    ('clf', SVC(kernel="linear", C=1,  probability=True,
                                class_weight={0: 1, 1: val}))
                ])
            classifiers['XRF'] = Pipeline([
                    ('scaler', RobustScaler()),
                    ('clf', ExtraTreesClassifier(
                            max_depth=5, n_estimators=2000,
                            max_features='sqrt',  # 'auto' was removed in scikit-learn 1.3
                            class_weight={0: 1, 1: val}))
                ])
            classifiers['Dummy'] = Pipeline([
                    ('clf', DummyClassifier(
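# The fragment cuts off inside the DummyClassifier pipeline; the overall
# pattern is scoring class-weighted pipelines across RepeatedKFold splits.
# A self-contained sketch of that loop (synthetic data; the weight grid and
# n_repeats=2 are illustrative stand-ins, and the feature-selection variant
# is omitted):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
weight_val = [1, 2, 5]
rows = []

sss = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)
for t_iter, (train, test) in enumerate(sss.split(X, y)):
    for val in weight_val:
        clf = Pipeline([
            ('scaler', RobustScaler()),
            ('clf', SVC(kernel='linear', C=1, probability=True,
                        class_weight={0: 1, 1: val})),
        ])
        clf.fit(X[train], y[train])
        proba = clf.predict_proba(X[test])[:, 1]
        rows.append({'Iteration': t_iter, 'Weight Val': val,
                     'AUC': roc_auc_score(y[test], proba)})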
Example #24
def predictConditions(query):
    
    print('Collecting all conditions:\n')
    
    conditions = getFeatures.get_conditions(
        query=query, startDate='2019-01-01', endDate='2020-12-31')
    
    print(conditions)
    

    patients = getFeatures.get_live_patients(
        query=query, startDate='2019-12-31', endDate='2019-12-31')
    
    print('\nNumber of patients', len(patients))
    

    age_groups = getFeatures.make_age_groups()
    
    print('\nAge groups\n', age_groups)
    

    print('\nCompute features: ')
    
    x_df = getFeatures.get_feature_vec(
        query,
        conditions=conditions,
        startDate='2019-01-01', 
        endDate='2019-12-31', 
        age_groups=age_groups)

    print('\nx_df.shape ', x_df.shape)
    
    print('\nCompute labels: ')
    
    y_df = getFeatures.get_feature_vec(
        query,
        conditions=conditions,
        startDate='2020-01-01', 
        endDate='2020-12-31', 
        age_groups=age_groups)
    
    print('\ny_df.shape ', y_df.shape)
    

    train, test = train_test_split(patients, test_size=0.25, random_state=42)
    
    x_train_df = x_df.loc[train]
    y_train_df = y_df.loc[train]
    x_test_df = x_df.loc[test]
    y_test_df = y_df.loc[test]
    
    print('\n\nTrain set:', len(train), 'Test set: ', len(test))
    
    print(
        '\n\nSorted x_train means:\n\n',
        x_train_df.mean().sort_values(ascending=False), 
        '\n\nSorted y_train means:\n\n',
        y_train_df.mean().sort_values(ascending=False)
    )

    filter_below = 20
    print('\nFiltering conditions with fewer than {} cases:'.format(filter_below))
    
    x_drop_list = (
        set(x_train_df.columns[x_train_df.sum() < filter_below])
        | set(x_test_df.columns[x_test_df.sum() < filter_below])
    )

    x_train_df = x_train_df.drop(x_drop_list, axis=1)
    x_test_df = x_test_df.drop(x_drop_list, axis=1)

    y_drop_list = ( 
        set(y_train_df.columns[y_train_df.sum() < filter_below])
        | set(y_test_df.columns[y_train_df.sum() < filter_below])
    )

    y_train_df = y_train_df.drop(y_drop_list, axis=1)
    y_test_df = y_test_df.drop(y_drop_list, axis=1)

    print(
        '\n\nSorted x_train means:\n\n',
        x_train_df.mean().sort_values(ascending=False), 
        '\n\nSorted y_train means:\n\n\n\n',
        y_train_df.mean().sort_values(ascending=False)
    )
    
    print(
        '\n\nSorted x_test means:\n\n',
        x_test_df.mean().sort_values(ascending=False), 
        '\n\nSorted y_test means:\n\n\n\n',
        y_test_df.mean().sort_values(ascending=False)
    )

    y_weights = 1 / (y_train_df.var() + 1e-3)
    y_weights = y_weights/(y_train_df.var()*y_weights).sum()
    
    print(
        '\n',
        pd.DataFrame(
            [y_train_df.var(), y_weights, y_weights*y_train_df.var()],
             index=['y_train var', 'y_weights', 'var*weight']
        ).transpose()
    )

    wmse = feature_weighted_mse.make_feature_weighted_mse(y_weights)
    
    print(
        '\nBasic benchmark - y means\n', 
        'Train loss',
        wmse(
            y_true=y_train_df.values, 
            y_pred=y_train_df.values.mean(axis=0)
        ).numpy().mean(),
    )

    from sklearn.model_selection import RepeatedKFold

    n_splits = 4
    n_repeats = 2
    alpha = 0.00001
    learning_rate = 0.001
    patience = 30

    print('\nTrain linear model (Lasso, alpha={}) with {}-fold CV repeated {} times.\n'.format(
        alpha, n_splits, n_repeats,
    ))
    

    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)
    
    models = []
    history = []
    performance = []

    i = 0
    for train_index, validate_index in rkf.split(x_train_df):
        
        i += 1
        print('\n\nFold {} out of {}\n\n'.format(i, n_splits*n_repeats))
        
        x_train, x_validate = x_train_df.iloc[train_index], x_train_df.iloc[validate_index]
        y_train, y_validate = y_train_df.iloc[train_index], y_train_df.iloc[validate_index]
    
        inputs = keras.layers.Input(shape=(x_train_df.shape[1],))
        outputs = keras.layers.Dense(
            units=y_train_df.shape[1], 
            kernel_regularizer=keras.regularizers.l1(l=alpha),
        )(inputs)
        
        models.append(keras.Model(inputs=inputs, outputs=outputs))

        models[-1].compile(loss=wmse, optimizer=keras.optimizers.Adam(learning_rate=learning_rate))

        history.append(models[-1].fit(
            x=x_train,
            y=y_train,
            batch_size=128,
            epochs=1000,
            validation_data=(x_validate, y_validate),
            callbacks=[
                keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True),
            ]
        ))
    
        print('\nEvaluate on test set:\n')
        performance.append(models[-1].evaluate(x=x_test_df, y=y_test_df))
        print(performance[-1],'\n')

    print('Test loss mean', np.mean(performance), 'std' , np.std(performance, ddof=1))
    

    
    constant_full = pd.DataFrame(
        np.array([model.layers[1].get_weights()[1] for model in models]).transpose(), 
        index=y_train_df.columns, 
        columns=['Fold {}'.format(i) for i in range(1, 1+n_splits*n_repeats)],
    )
    constant_full.to_csv('constant_full.csv')
    
    coef_mat = np.array([model.layers[1].get_weights()[0] for model in models]).transpose((1, 2, 0))
    
    coef_full = pd.DataFrame(
        [[json.dumps(coef_mat[i,j].tolist()) 
          for j in range(coef_mat.shape[1])] 
         for i in range(coef_mat.shape[0])], 
        columns=y_train_df.columns, 
        index=x_train_df.columns
    ).transpose()
    
    coef_full.to_csv('coef_full.csv')
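# The feature_weighted_mse module used above is not shown; a minimal sketch of
# what such a loss factory might look like, assuming it weights each output's
# squared error by y_weights before averaging (an assumption, not the
# project's actual implementation):
import numpy as np
import tensorflow as tf

def make_feature_weighted_mse(y_weights):
    # constant per-output weights, broadcast across the batch
    w = tf.constant(np.asarray(y_weights, dtype='float32'))

    def wmse(y_true, y_pred):
        # weighted mean of squared errors over the output dimension
        return tf.reduce_mean(tf.square(y_true - y_pred) * w, axis=-1)

    return wmse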
Example #25
        xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits
plot_importance(clf)
plt.show()
print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, target)))

# stack the lgb and xgb out-of-fold results

train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx,
            val_idx) in enumerate(folds_stack.split(train_stack, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)

    oof_stack[val_idx] = clf_3.predict(val_data)
    predictions += clf_3.predict(test_stack) / 10  # average over 5 splits x 2 repeats

print(mean_squared_error(target.values, oof_stack))

sub_df = pd.read_csv('./jinnan_round1_submit_20181227.csv', header=None)
sub_df[1] = predictions
sub_df[1] = sub_df[1].apply(lambda x: round(x, 3))
Example #26
def boston():
    # load_boston was deprecated in scikit-learn 1.0 and removed in 1.2
    data = load_boston()
    X = data['data']
    y = data['target']
    cv = RepeatedKFold(n_splits=10, n_repeats=4)
    return X, y, cv.split(X)
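# Since load_boston is gone from recent scikit-learn, an equivalent loader can
# pull the OpenML copy instead (the OpenML dataset name and version are
# assumptions here):
from sklearn.datasets import fetch_openml
from sklearn.model_selection import RepeatedKFold

def boston_openml():
    data = fetch_openml(name='boston', version=1, as_frame=False)
    X, y = data.data, data.target
    cv = RepeatedKFold(n_splits=10, n_repeats=4)
    return X, y, cv.split(X)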
Example #27
#     # t = ttest_ind(
#     #     X[c].fillna(X[c].mean()), 
#     #     X_test[c].fillna(X_test[c].mean()))
#     t = ks_2samp(
#         X[c].dropna(), 
#         X_test[c].dropna())
#     # print(c, t)
#     if t[1] < 0.001:
#         print(c, t)
#         cols_to_drop.append(c)
# print(f'Dropping after statistical tests: {cols_to_drop}')
# X = X.drop(cols_to_drop, axis=1, errors='ignore')
# X_test = X_test.drop(cols_to_drop, axis=1, errors='ignore')
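# The commented-out block above drops features whose train/test distributions
# diverge under a two-sample Kolmogorov-Smirnov test. A self-contained sketch
# of that check (toy frames with distinct names so the live X/X_test below are
# untouched; the 0.001 threshold follows the comment):
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
X_tr = pd.DataFrame({'a': rng.normal(0, 1, 500), 'b': rng.normal(0, 1, 500)})
X_te = pd.DataFrame({'a': rng.normal(0, 1, 500), 'b': rng.normal(2, 1, 500)})

drift_cols = [c for c in X_tr.columns
              if ks_2samp(X_tr[c].dropna(), X_te[c].dropna())[1] < 0.001]
X_tr = X_tr.drop(drift_cols, axis=1, errors='ignore')
X_te = X_te.drop(drift_cols, axis=1, errors='ignore')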

p_test = []
for fold_i, (train_index, valid_index) in enumerate(kf.split(X, y)):
    x_train = X.iloc[train_index].copy()
    x_valid = X.iloc[valid_index].copy()

    y_train = y[train_index]
    y_valid = y[valid_index]

    x_test = X_test.copy()

    # Frequency encoding
    for c in cat_features:
    # for c in ['hospital_id']:
        if c in x_train.columns:
            encoding = x_train.groupby(c).size()
            encoding = encoding/len(x_train)
            x_train[f'{c}_fe'] = x_train[c].map(encoding)
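# The fragment ends partway through the frequency encoding; the usual
# completion maps the same training-set frequencies onto the validation and
# test frames as well. A standalone sketch (toy frames; values invented):
import pandas as pd

enc_train = pd.DataFrame({'hospital_id': ['a', 'a', 'b', 'c']})
enc_valid = pd.DataFrame({'hospital_id': ['a', 'b', 'd']})

for c in ['hospital_id']:
    encoding = enc_train.groupby(c).size() / len(enc_train)
    enc_train[f'{c}_fe'] = enc_train[c].map(encoding)
    # categories unseen in training (here 'd') map to NaN; fill as needed
    enc_valid[f'{c}_fe'] = enc_valid[c].map(encoding)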
Example #28
def energy():
    df = pd.read_csv(f'{datasets_folder}/energy_efficiency.csv')
    X = df.iloc[:, :-2]
    y = df.iloc[:, -2]
    cv = RepeatedKFold(n_splits=10, n_repeats=4)
    return X, y, cv.split(X)
Example #29
plot = plt.scatter(y_test, y_pred)

# In[30]:

from sklearn.metrics import roc_auc_score

print(confusion_matrix(y_test, y_pred))
#print(roc_auc_score(y_test, y_pred))

# In[31]:

from sklearn.model_selection import RepeatedKFold
random_state = 12883823
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state)
for train, test in rkf.split(x):
    print("%s %s" % (train, test))

# In[32]:

from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
for train, test in loo.split(x):
    print("%s %s" % (train, test))

# In[33]:

get_ipython().run_line_magic('matplotlib', 'inline')
svclassifier = SVC(kernel='rbf', C=1)
svclassifier.fit(x_train, y_train)
y_pred = svclassifier.predict(x_test)
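# The cells above iterate the splitters by hand; when only scores are needed,
# cross_val_score can consume the same RepeatedKFold directly. A short sketch
# (synthetic data; accuracy is the scikit-learn default score for classifiers):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=12883823)
scores = cross_val_score(SVC(kernel='rbf', C=1), X, y, cv=rkf)
print(scores.mean(), scores.std())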
Example #30
def power():
    df = pd.read_csv(f'{datasets_folder}/power.csv')
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    cv = RepeatedKFold(n_splits=10, n_repeats=4)
    return X, y, cv.split(X)
dep_f = []
mse_f = []
rmse_f = []
mae_f = []
mdae_f = []
evs_f = []
r2_f = []
# `dep`, `rkf`, and the metric aliases (mse, mae, mdae, evs, r2) are defined
# elsewhere in the source
for i in dep:
    c = 0
    mse_t = []
    rmse_t = []
    mae_t = []
    mdae_t = []
    evs_t = []
    r2_t = []
    for tr_i, ts_i in rkf.split(data):
        print(i, c)
        train, test = data.iloc[tr_i], data.iloc[ts_i]
        train_x = train.drop(columns=['Rainfall'])
        train_y = train['Rainfall']
        test_x = test.drop(columns=['Rainfall'])
        test_y = test['Rainfall']
        model = RandomForestRegressor(n_estimators=100, max_depth=i)
        model.fit(train_x, train_y)
        ts_p = model.predict(test_x)
        mse_t.append(mse(test_y, ts_p))
        mae_t.append(mae(test_y, ts_p))
        mdae_t.append(mdae(test_y, ts_p))
        evs_t.append(evs(test_y, ts_p))
        r2_t.append(r2(test_y, ts_p))
        c += 1
Example #32
def wine():
    df = pd.read_csv(f'{datasets_folder}/wine.csv', sep=';')
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    cv = RepeatedKFold(n_splits=10, n_repeats=4)
    return X, y, cv.split(X)
        if len(daily_attention) == age and len(daily_share) == age:
            attention_data.append(daily_attention)
            share_data.append(daily_share)
            vid_array.append(vid)

    # convert to ndarray
    attention_data = np.array(attention_data)
    share_data = np.array(share_data)
    vid_array = np.array(vid_array)

    # == == == == == == == == Part 4: Forecast future attention == == == == == == == == #
    # 10-repeated 10-fold cross validation
    rkf = RepeatedKFold(n_splits=10, n_repeats=10)

    fold_idx = 0
    for train_cv_idx, test_idx in rkf.split(vid_array):
        fold_idx += 1
        print('>>> Forecast on fold: {0}'.format(fold_idx))

        # == == == == == == == == Part 5: Split cv subset to select best alpha value == == == == == == == == #
        train_idx, cv_idx = train_test_split(train_cv_idx, test_size=0.1)

        # grid search best alpha value over -4 to 4 in log space
        alpha_array = [10 ** t for t in range(-4, 5)]
        cv_mse = []
        for alpha in alpha_array:
            # == == == == == == == == Part 6: Training with Ridge Regression == == == == == == == == #
            cv_predict = forecast_future_attention(train_idx, cv_idx, alpha)

            # == == == == == == == == Part 7: Evaluate cv mean squared error == == == == == == == == #
            cv_norm = np.sum(attention_data[cv_idx, :age], axis=1)
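# The fragment stops while accumulating cv_mse; the surrounding pattern (pick
# the best alpha from a log-spaced grid on a held-out split, then refit) looks
# roughly like this with a plain Ridge stand-in, since
# forecast_future_attention itself is not shown:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X = np.random.rand(200, 10)
y = np.random.rand(200)
X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_cv, y_train, y_cv = train_test_split(X_dev, y_dev, test_size=0.1)

alpha_array = [10 ** t for t in range(-4, 5)]  # grid over 1e-4 .. 1e4
cv_mse = [mean_squared_error(y_cv, Ridge(alpha=a).fit(X_train, y_train).predict(X_cv))
          for a in alpha_array]
best_alpha = alpha_array[int(np.argmin(cv_mse))]
model = Ridge(alpha=best_alpha).fit(X_dev, y_dev)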
Example #34
def lasso_train(groups, varname='valence', arrayname='norm', alpha=None,
                use_lars=True, fit_intercept=True, normalize=True,
                cv_folds=None, cv_repeats=None, skip_cv=False,
                xmin=-np.inf, xmax=np.inf, _larch=None, **kws):

    """use a list of data groups to train a Lasso/LassoLars model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      alpha       alpha parameter for LassoLars (See Note 5) [None]
      use_lars    bool to use LassoLars instead of Lasso [True]
      cv_folds    None or number of Cross-Validation folds (see Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (see Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]

    Returns
    -------
      group with trained LassoLars model, to be used with lasso_predict

    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  All groups must have an attribute (scalar value) for `varname`.
     3.  arrayname can be one of `norm` or `dmude`
     4.  Cross-Validation: if cv_folds is None, sqrt(len(groups)) will be used
            (rounded to integer).  If cv_repeats is None, sqrt(len(groups)) - 1
            will be used (rounded).
     5.  alpha is the regularization parameter. If alpha is None it will
         be set using LassoLarsCV.
    """
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)

    nvals = len(groups)

    kws.update(dict(fit_intercept=fit_intercept, normalize=normalize))
    creator = LassoLars if use_lars else Lasso
    model = None

    rmse_cv = None
    if not skip_cv:
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        if alpha is None:
            lcvmod = LassoLarsCV(cv=cv, max_n_alphas=int(1e7),
                                 max_iter=int(1e7), eps=1.e-12, **kws)
            lcvmod.fit(spectra, ydat)
            alpha = lcvmod.alpha_

        model = creator(alpha=alpha, **kws)
        resid = []
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt((resid**2).mean())

    if alpha is None:
        # reached only when skip_cv is True; note that plain Lasso/LassoLars
        # do not expose an `alpha_` attribute after fit, so this fallback
        # assumes a CV-style estimator
        cvmod = creator(**kws)
        cvmod.fit(spectra, ydat)
        alpha = cvmod.alpha_

    if model is None:
        model = creator(alpha=alpha, **kws)

    # final fit without cross-validation
    out = model.fit(spectra, ydat)

    ypred = model.predict(spectra)

    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 alpha=alpha, active=model.active_, coefs=model.coef_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats,
                 rmse_cv=rmse_cv, rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, fit_intercept=fit_intercept,
                 normalize=normalize, groupnames=groupnames, keywords=kws)
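# Hypothetical usage sketch for lasso_train (the group list and its attribute
# names follow the docstring above; building larch groups from real spectra is
# project-specific and assumed here, so the call is shown commented out):
#
#   result = lasso_train(groups, varname='valence', arrayname='norm',
#                        cv_folds=5, cv_repeats=2)
#   print('alpha:', result.alpha)
#   print('CV RMSE:', result.rmse_cv, 'fit RMSE:', result.rmse)
#   print('active components:', result.active)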