Example #1
0
def get_results_protein():
    """Load the protein dataset, normalise it and cross-validate a Ridge model."""
    train, test = read_data.get_data("Grupa5_data/protein.RData")

    # First 2000 columns are the features; column index 2000 is the target.
    features = pd.DataFrame(train.iloc[:, 0:2000])
    holdout = pd.DataFrame(test)

    features = read_data.normalize_data(features)
    holdout = read_data.normalize_data(holdout)
    target = train.iloc[:, 2000]

    model = Ridge(alpha=0.01)  # alpha = 1e-2
    cross_validation.cross_validate(features, target, model)
def crossval(k, configspath, gpu, dataset, savefolder, num_epochs, verbose):
    """
    Cross-validate `dataset` over a set of network configurations.

    Parameters
    ----------
    k : int
        Number of random configurations to generate when no config file is given.
    configspath : str or None
        Optional path to a file with one "int int float" configuration per line.
    gpu : passed through to cross_validation.cross_validate.
    dataset : data to cross-validate; pickled into `savefolder` when given.
    savefolder : str or None
        Folder where the dataset is stored (created if missing).
    num_epochs : int
        Training epochs per fold.
    verbose : bool
        When true, print the configurations and progress information.

    Returns
    -------
    The best configuration reported by cross_validation.cross_validate.
    """
    if savefolder:
        if not os.path.exists(savefolder):
            os.makedirs(savefolder)

        # Persist the dataset next to the results for reproducibility.
        datasetpath = os.path.join(savefolder, 'dataset')
        with open(datasetpath, 'wb') as datasetfile:
            cPickle.dump(dataset, datasetfile, cPickle.HIGHEST_PROTOCOL)

    if not configspath:
        configs = network_configs.random_network_configs(k)
    else:
        # Each line: "<int> <int> <float>". Whitespace split() is robust to a
        # missing trailing newline on the last line (the old [:-1] would chop
        # the final digit) and closes the file via the context manager (the
        # old `for line in open(...)` leaked the handle).
        configs = []
        with open(configspath) as configsfile:
            for line in configsfile:
                fields = line.split()
                configs.append((int(fields[0]), int(fields[1]), float(fields[2])))

    if verbose:
        print(configs)
        print('Dataset read')
        print('Cross validation with %d folds' % len(configs))

    best_config = cross_validation.cross_validate(configs, dataset, savefolder=savefolder,
                                                  num_epochs=num_epochs, verbose=verbose,
                                                  gpu=gpu)
    return best_config
Example #3
0
def test_cv():
    """Smoke-test cross_validate on two noisy linear signals."""
    n = 100
    x = np.array([1, 1])
    slope_a, slope_b = 10, 25

    # Two observation rows with different slopes and noise levels;
    # seed fixed so the fold assignment and noise are reproducible.
    y = np.empty((2, n))
    np.random.seed(42)
    y[0] = slope_a * x[0] + 3 * np.random.randn(n)
    y[1] = slope_b * x[0] + 5 * np.random.randn(n)

    splitter = ds.KFold(n, 5, randomize=True)

    params = (slope_a, slope_b)
    lamdas_list = [np.array([0.3, 3, 30]), np.array([0.5, 5, 50])]

    # Exercise every verbose / save_x_hats combination.
    for verbose, save_x_hats in product([True, False], [True, False]):
        ret = cross_validate(y, splitter, mle, params, lamdas_list,
                             l2_error, params, verbose, save_x_hats)

        # With save_x_hats the result tuple carries one extra element.
        if save_x_hats:
            lamda_stars, lamda_star_indices, error, mean_error, x_hats = ret
        else:
            lamda_stars, lamda_star_indices, error, mean_error = ret

        print(mean_error)
        # The middle lamda of each grid should be selected.
        assert lamda_star_indices[0] == 1
        assert lamda_star_indices[1] == 1
Example #4
0
def get_results_cancer():
    """Load the cancer dataset (genes as rows), normalise and cross-validate Ridge."""
    train, test = read_data.get_data("Grupa5_data/cancer.RData")

    # Rows 0..17736 are features, row 17737 is the target; samples are
    # columns, so everything is transposed to samples-as-rows.
    features = pd.DataFrame(train.iloc[0:17737, :]).T
    target = train.iloc[17737, :].T
    holdout = pd.DataFrame(test).T

    print(features)
    print(holdout)

    features = read_data.normalize_data(features)
    holdout = read_data.normalize_data(holdout)

    model = Ridge()
    cross_validation.cross_validate(features, target, model)
def process_lambda_grid_search(id, y, X, fold_count, seed, gd_func, max_iters, gamma, lamb, degree):
    """Run k-fold CV for one (gamma, lamb) grid point and pickle the results."""
    _, num_features = X.shape
    initial_w = np.ones(num_features)

    # k-fold cross-validation for this hyper-parameter combination.
    w_stars, train_correct_ratios, test_correct_ratios = cross_validation.cross_validate(
        id, y, X, fold_count, seed, gd_func, initial_w, max_iters, gamma, lamb)

    # File name encodes the full grid point so parallel runs never collide.
    filename = ("train_clean_avg_L2_degree{degree}_fold{fold}_gamma{gamma}"
                "_iter{iter}_lamb{lamb}.pickle").format(
        degree=degree, fold=fold_count, gamma=gamma, iter=max_iters, lamb=lamb)
    with open(filename, "wb") as pickle_file:
        pickle.dump((w_stars, train_correct_ratios, test_correct_ratios), pickle_file)
def main():
    # when the attributes have different data range
    heterogeneous_data = mock_Chinese_stock_price.get_stockset_various()

    # in this dataset, I have added investment, and employee number,
    # they all have large numbers and will influence the results significantly without normalization,
    # then those more important attributes with smaller values may not influence the result and the final result cannot be accurate
    print "before re-scale/normalization"
    cv_total_error_unweighted = cross_validation.cross_validate(heterogeneous_data, algr=KNN.get_KNN, trails=100)
    cv_total_error_weighted = cross_validation.cross_validate(heterogeneous_data, algr=KNN.get_weightedKNN, trails=100)
    print "cross validation, using un-weighted KNN: ", cv_total_error_unweighted
    print "cross validation, using weighted KNN: ", cv_total_error_weighted

    print "after re-scale"
    scale = [10, 10, 10, 0.00001, 0]
    scaled_data = rescale(heterogeneous_data, scale)
    scaled_cv_total_error_unweighted = cross_validation.cross_validate(scaled_data, algr=KNN.get_KNN, trails=100)
    scaled_cv_total_error_weighted = cross_validation.cross_validate(scaled_data, algr=KNN.get_weightedKNN, trails=100)
    print "cross validation, using un-weighted KNN: ", scaled_cv_total_error_unweighted
    print "cross validation, using weighted KNN: ", scaled_cv_total_error_weighted

    print "after normalization"
    min_max = [(1, 10), (1, 20), (1, 50), (10000, 10000000)]
    normalized_data = normalization(heterogeneous_data, min_max)
    normalized_cv_total_error_unweighted = cross_validation.cross_validate(
        normalized_data, algr=KNN.get_KNN, trails=100
    )
    normalized_cv_total_error_weighted = cross_validation.cross_validate(
        normalized_data, algr=KNN.get_weightedKNN, trails=100
    )
    print "cross validation, using un-weighted KNN: ", normalized_cv_total_error_unweighted
    print "cross validation, using weighted KNN: ", normalized_cv_total_error_weighted
def main():
    # when the attributes have different data range
    heterogeneous_data = mock_Chinese_stock_price.get_stockset_various()
    
    # in this dataset, I have added investment, and employee number, 
    # they all have large numbers and will influence the results significantly without normalization, 
    # then those more important attributes with smaller values may not influence the result and the final result cannot be accurate
    print 'before re-scale/normalization'
    cv_total_error_unweighted = cross_validation.cross_validate(heterogeneous_data, algr = KNN.get_KNN, trails=100)
    cv_total_error_weighted = cross_validation.cross_validate(heterogeneous_data, algr = KNN.get_weightedKNN, trails=100)
    print 'cross validation, using un-weighted KNN: ', cv_total_error_unweighted
    print 'cross validation, using weighted KNN: ', cv_total_error_weighted
    
    print 'after re-scale'
    scale = [10, 10, 10, 0.00001, 0]
    scaled_data = rescale(heterogeneous_data, scale)
    scaled_cv_total_error_unweighted = cross_validation.cross_validate(scaled_data, algr = KNN.get_KNN, trails=100)
    scaled_cv_total_error_weighted = cross_validation.cross_validate(scaled_data, algr = KNN.get_weightedKNN, trails=100)
    print 'cross validation, using un-weighted KNN: ', scaled_cv_total_error_unweighted
    print 'cross validation, using weighted KNN: ', scaled_cv_total_error_weighted
    
    print 'after normalization'
    min_max = [(1,10), (1,20), (1,50), (10000, 10000000)]
    normalized_data = normalization(heterogeneous_data, min_max)
    normalized_cv_total_error_unweighted = cross_validation.cross_validate(normalized_data, algr = KNN.get_KNN, trails=100)
    normalized_cv_total_error_weighted = cross_validation.cross_validate(normalized_data, algr = KNN.get_weightedKNN, trails=100)
    print 'cross validation, using un-weighted KNN: ', normalized_cv_total_error_unweighted
    print 'cross validation, using weighted KNN: ', normalized_cv_total_error_weighted
def estimate_meta(directories, trainer, range_values, label, static_features):
    """Sweep one feature parameter, cross-validate each value, plot error rates."""
    images = load(directories, True, permute=True)
    results = []
    for value, feature in range_values:
        print("Optimizing %s: %f" % (feature.key(), value))
        # Evaluate the candidate feature together with the fixed ones.
        calculators = [feature] + static_features
        error_rate = cross_validate(images, calculators, trainer, k=10, verbose=False)
        print("Error rate: %f" % error_rate)
        results.append([value, error_rate])

        # Clear cached values of the swept feature before the next point.
        for image in images:
            image.reset(feature)

    # The output graph is named after the calling function.
    caller_name = inspect.stack()[1][3]
    plot = ScatterPlot(ylabel='error rate', xlabel='param')
    plot.save([label], [results], "result_graphs/" + caller_name)
Example #9
0
def estimate_meta(directories, trainer, range_values, label, static_features):
    """Cross-validate a feature over a range of parameter values and save a plot."""
    images = load(directories, True, permute=True)

    sweep = []
    for param_value, feature in range_values:
        print("Optimizing %s: %f" % (feature.key(), param_value))
        error_rate = cross_validate(
            images, [feature] + static_features, trainer, k=10, verbose=False)
        print("Error rate: %f" % error_rate)
        sweep.append([param_value, error_rate])

        # Drop the cached values of the swept feature before the next run.
        for img in images:
            img.reset(feature)

    # Name the output graph after the calling function.
    graph_name = inspect.stack()[1][3]
    ScatterPlot(ylabel='error rate', xlabel='param').save(
        [label], [sweep], "result_graphs/" + graph_name)
Example #10
0
# Dense baseline #2: 180-dim input -> three 80-unit sigmoid layers ->
# single linear output, trained with Adam on mean squared error.
model2 = Sequential()
model2.add(Dense(input_shape=(180,), units=80, activation=sigmoid))
model2.add(Dense(80, activation=sigmoid))
model2.add(Dense(80, activation=sigmoid))
model2.add(Dense(1, activation=linear))
model2.compile(optimizer="adam", loss=mean_squared_error, metrics=['mse'])

# Dense baseline #3: same architecture as model2 with fresh weights.
model3 = Sequential()
model3.add(Dense(input_shape=(180,), units=80, activation=sigmoid))
model3.add(Dense(80, activation=sigmoid))
model3.add(Dense(80, activation=sigmoid))
model3.add(Dense(1, activation=linear))
model3.compile(optimizer="adam", loss=mean_squared_error, metrics=['mse'])

# Cross-validate all three dense models (20 epochs each, silent).
mses_model1 = cross_validate(model, x_train, y_train, epochs=20, verbose=0)
mses_model2 = cross_validate(model2, x_train, y_train, epochs=20, verbose=0)
mses_model3 = cross_validate(model3, x_train, y_train, epochs=20, verbose=0)

# RNN
# Re-encode the data as sequences (dynamic=True) for the recurrent models.
featu_enc, x, y = get_data(data, dynamic=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

# LSTM model: embeds 21 symbols into 50 dims over length-18 sequences,
# one 100-unit LSTM layer, linear regression head.
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=21, output_dim=50, input_length=18))
rnn_model.add(LSTM(100, return_sequences=False))
rnn_model.add(Dense(1, activation=linear))
rnn_model.compile(optimizer="adam", loss=mean_squared_error, metrics=['mse'])

# Second RNN — its remaining layers/compile continue past this chunk.
rnn_model2 = Sequential()
rnn_model2.add(Embedding(input_dim=21, output_dim=50, input_length=18))
Example #11
0
    # print(x_train)
    # print(x_train)
    # print(y_train)

    # rf = RandomForestRegressor(n_estimators=100)
    # cross_validation.cross_validate(x_train, y_train, rf)
    #
    # rf = RandomForestRegressor(n_estimators=100)
    # features = feature_engineering.recursive_feature_engineering(rf, x_train, y_train)
    #
    # rf = RandomForestRegressor(n_estimators=100)
    # cross_validation.cross_validate(x_train[features], y_train, rf)

    # rf.fit(x_train, y_train)
    # lr = LogisticRegression()
    # rf = RandomForestRegressor()
    # dt = DecisionTreeRegressor()
    # Fit an XGBoost regressor and report its cross-validation score.
    xgb = XGBRegressor()
    # sv = svm.SVR()
    # model = sv
    model = xgb
    model.fit(x_train, y_train)
    cross_validation.cross_validate(x_train, y_train, model)
    # Keep only the features whose importance is in the top 0.25% quantile
    # of the fitted model's importance scores.
    f_imp = model.feature_importances_
    cut_off = np.quantile(f_imp, q=(1-0.0025))
    # NOTE(review): np.where with one argument returns a tuple of index
    # arrays — presumably used downstream to index columns; confirm.
    feature_list = np.where(f_imp >= cut_off)
    features = feature_list
    # features = feature_engineering.svm_feature_engineering(x_train, y_train)
    print(features)
Example #12
0
"""
Launch a cross-validation on three models to see which is best on wine data.
Models to test: KNeighborsClassifier, DecisionTreeClassifier and MLPClassifier.

Author: Claudio Sousa, David Gonzalez
"""

from sklearn import datasets
from cross_validation import cross_validate, plot_validation, output_csv, normalise_data
from models import instanciate_kneighbors_model, instanciate_decisiontree_model, instanciate_mlp_model

# Load the wine dataset and normalise its feature matrix in place.
data = datasets.load_wine()
data.data = normalise_data(data.data)

# Candidate models: k-nearest-neighbours and decision trees instantiated
# over the 1..11 parameter range, plus an MLP.
models = [
    instanciate_kneighbors_model(1, 11),
    instanciate_decisiontree_model(1, 11),
    instanciate_mlp_model()
]

# Cross-validate with arguments 5 and 10 — presumably repetitions/folds;
# verify against the cross_validation module. Then persist and plot.
best_model = cross_validate(data, models, 5, 10)
output_csv(models, best_model, "wine")
plot_validation(models, best_model)
Example #13
0
def forward_stepwise_selection(
    func_builder: FuncBuilder,
    data: pd.DataFrame,
    fs: List[str],
    e: str,
    seg_col: str,
    K: int,
) -> (List[str], Dict[int, float]):
    """Greedy forward stepwise feature selection.

    At each step the feature whose addition maximises the in-sample r2
    score is appended to the current subset; the best-sized subset is then
    chosen by K-fold cross-validated r2.

    Parameters:
        func_builder: factory producing a fit/score-able model from
            (features, explained_variable, segment_column).
        data: dataset used both for fitting and cross-validation.
        fs: candidate feature names. Not modified — the original
            implementation destructively emptied this list via `remove`;
            a copy is consumed instead so callers keep their list intact.
        e: name of the explained (target) variable.
        seg_col: segmentation column passed through to the model builder.
        K: number of cross-validation folds.

    Returns:
        (best_feature_subset, {step_index: cv_r2_score}).
    """
    # Work on a copy so the caller's list is not mutated as a side effect.
    remaining = list(fs)

    best_fs = [[]]
    while len(remaining) > 0:
        best_score = 0
        best_scoring_feature = ""
        first_check = True

        # Try extending the current best subset with each remaining feature.
        for f in remaining:
            tmp_fs = best_fs[-1][:]
            tmp_fs.append(f)
            clf = func_builder(tmp_fs, e, seg_col)
            clf.fit(data)

            feature_score, _ = clf.score(data, "r2")

            # first_check keeps the comparison valid for negative r2 values.
            if first_check or feature_score > best_score:
                best_score = feature_score
                best_scoring_feature = f
                first_check = False

        best_fs.append(best_fs[-1] + [best_scoring_feature])
        remaining.remove(best_scoring_feature)

    # Score every prefix size by cross-validated r2 and keep the best one.
    scores_by_step = {}
    res_index = 0
    best_score = 0
    first_check = True
    for i in range(len(best_fs)):
        clf = func_builder(best_fs[i], e, seg_col)

        r2_scores, _ = cv.cross_validate(clf, data, ["r2"], K)
        tmp_r2_score = r2_scores["r2"]

        scores_by_step[i] = tmp_r2_score

        if first_check or tmp_r2_score > best_score:
            best_score = tmp_r2_score
            res_index = i
            first_check = False

    return best_fs[res_index], scores_by_step


# def forward_stepwise_selection(
#         df : pd.DataFrame,
#         fs: List[str],
#         e: str,
#         K: int) -> List[str] :

#     best_features = []
#     while (len(best_features) < len(fs)) :
#         remaining_features = list(set(fs) - set(best_features))
#         scores = {}

#         for f in remaining_features :
#             tmp = best_features
#             tmp.append(f) # {x_1, x_2, a}
#             clf = wrappers.ProyectionRegression(
#                 features=tmp,
#                 explain=e
#             )
#             clf.fit(df)
#             clf.predict(df)

#             scores[f] = clf.score(df, "r2")
#         best_features.append(max(scores.items(), key=operator.itemgetter(1))[0])

#     predictor_score = {}
#     pred_features = []
#     for f in best_features:
#         pred_features.append(f)
#         clf = wrappers.ProyectionRegression(
#             features = pred_features,
#             explain = e,
#         )
#         predictor_score[f] = cv.cross_validate(clf, df, "r2", K)

#     f = max(predictor_score.items(), key=operator.itemgetter(1))[0]
#     res = []
#     for k in best_features:
#         if k != f:
#             res.append(k)
#         break
#     res.append(f)
#     return res
 def costf(scale):
     """Cross-validation error of the data after re-scaling by `scale`.

     NOTE(review): reads `data`, `algr` and `trails` from the enclosing
     scope — confirm against the outer function this closure belongs to.
     """
     rescaled_data = rescale(data, scale)
     cost = cross_validation.cross_validate(rescaled_data, algr, trails)
     return cost
Example #15
0
import json

# Final (model, split, error) records; per-fold curves are collected in
# `dump` only when cross-validation is enabled.
dump_final = []
if(cross_val):
    dump = []

for i in range(len(model_list)):
    # Fresh model / optimizer / LR schedule / loss for each architecture.
    model = model_list[i]
    optimizer = optimizer_list[i](model.parameters(),lr=lr_list[i])
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,scheduler_gamma_list[i])
    criterion = criterion_list[i]
    print("Model {} of {} : {}".format(i+1,len(model_list),model.name()))
    if(cross_val):
        k_fold = 4
        print("Cross validating... on {} folds".format(k_fold))
        tr_loss, val_loss, tr_err, val_err = cv.cross_validate(model, criterion,optimizer,scheduler,train_input,train_target,k_fold,batch_size=10,n_epochs=epochs_list[i],n_augmentation=0,verbose=2)
        print("Mean train error : {}, mean validation error : {}".format(tr_err[-1],val_err[-1]))
        dump.append((model.name(), " train ", tr_loss, tr_err))
        dump.append((model.name(), " validation ", val_loss, val_err))

    # Reset weights, then re-train on the full training set for the final
    # train/test evaluation.
    model.reset()
    print("Training...")
    cv.train_model(model,criterion,optimizer,scheduler,train_input,train_target,n_epochs=epochs_list[i],batch_size=10,n_augmentation=0,verbose=2)
    final_tr_error = cv.evaluate_error(model,train_input,train_target)
    final_te_error = cv.evaluate_error(model,test_input,test_target)
    print("Train error = {} ; Test error = {} ".format(final_tr_error,final_te_error))

    dump_final.append((model.name(), " train ", final_tr_error.item()))
    dump_final.append((model.name(), " test " , final_te_error.item()))

Example #16
0
# Randomise the order so interrupted runs sample the grid uniformly.
shuffle(all_settings)

# Save results in a dict, mapping settings to results

grid_filename = 'gridsearch2'

# If the grid search was started previously, load the results
# Otherwise, start from an empty dict
# NOTE(review): pickle.load is only safe here because the checkpoint is
# produced by this script itself — never load untrusted pickle files.
if os.path.exists(filepath(grid_filename)):
    with open(filepath(grid_filename), 'rb') as f:
        results = pickle.load(f)
else:
    results = {}

# For each setting that hasn't already been tried,
# apply cross-validation, and save the scores
for pre_set, train_set in all_settings:
    # Results are keyed by the concatenation of the pre-processing and
    # training parameter tuples.
    if pre_set + train_set in results:
        continue
    print(pre_set, train_set)
    scores = cross_validate(messages,
                            gold,
                            folds,
                            get_vectoriser,
                            preproc_args=pre_set,
                            train_kwargs=train_kwargs(*train_set))
    print(scores)
    # Checkpoint after every setting so progress survives interruption.
    results[pre_set + train_set] = scores
    with open(filepath(grid_filename), 'wb') as f:
        pickle.dump(results, f)