Example no. 1
def rolling_model_PLS(df_X, df_Y):
    split_num = 200 * 60
    X_traindata = df_X[:split_num * 2]
    Y_traindata = df_Y[:split_num * 2]
    X_vdata = df_X[split_num:split_num * 2]
    X_testdata = df_X[split_num * 2:split_num * 3]
    Y_testdata = df_Y[split_num * 2:split_num * 3]

    # specify parameters and distributions to sample from

    num_valid_size = len(X_traindata) - len(X_vdata)
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)

    # specify parameters and distributions to sample from
    param_dist = {'n_components': sp_randint(1, 100), 'max_iter': sp_randint(50, len(X_traindata)),
                  'tol': [0.0001, 0.00001, 0.000001, 0.0000001]}
    # param_dist = {'n_components':[3,4]}
    PLS_model = PLSRegression(scale=False)

    # run gridsearchcv make_scorer(r2_score)
    n_iter_search = 50
    estim = RandomizedSearchCV(PLS_model, param_distributions=param_dist, scoring='r2',
                               cv=ps.split(), iid=False, n_jobs=1, n_iter=n_iter_search)

    estim.fit(X_traindata, Y_traindata)
    best_estimator = estim.best_estimator_
    v_pred = best_estimator.predict(df_X[:split_num])
    v_performance_score = r2_score(df_Y[:split_num], v_pred)
    test_pre_y_array = best_estimator.predict(X_testdata)
    test_performance_score = r2_score(Y_testdata, test_pre_y_array)

    return v_performance_score, test_performance_score
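
A minimal, self-contained sketch (synthetic labels, not from the snippet above) of the test_fold convention that this and the following examples rely on: samples labelled -1 are used only for training, and each distinct non-negative label defines one held-out fold.

import numpy as np
from sklearn.model_selection import PredefinedSplit

test_fold = np.array([-1, -1, -1, 0, 0, 1, 1])  # -1: always train; 0 and 1: two held-out folds
ps = PredefinedSplit(test_fold)
print(ps.get_n_splits())  # 2, one split per non-negative label
for train_idx, test_idx in ps.split():
    print(train_idx, test_idx)
# split 1: train = [0 1 2 5 6], test = [3 4]
# split 2: train = [0 1 2 3 4], test = [5 6]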
Example no. 2
def test_jdata_fscorer_class():
    monkey_patch.run()

    user_sku_pair, data, target, pred_proba, test_fold, expected_scores = (
        get_jdata_test_cases())

    pred_map = {}
    clf = MockEstimatorWithPredefinedPrediction(pred_map)
    ps = PredefinedSplit(test_fold)

    for train_index, test_index in ps.split():
        print("TRAIN:", train_index, "TEST:", test_index)
        clf.set(data[train_index, :], pred_proba[train_index])
        clf.set(data[test_index, :], pred_proba[test_index])

    scoring = {
        "custom_score_index": JDataScore(),
        "custom_score_index_with_user_sku_pair": JDataScore(user_sku_pair),
    }
    scores = cross_validate(clf,
                            data,
                            target,
                            scoring=scoring,
                            cv=ps,
                            return_estimator=True)

    for name in scoring.keys():
        assert_almost_equal(scores[f"test_{name}"], expected_scores)
Example no. 3
def decode(X, y, cv_ids, model):
    """
    Parameters
    --------------
    X: np.array, n_stimuli x n_voxels
    y: np.array, n_stimuli, 
    cv_ids: np.array - n_stimuli, 
    
    Return
    --------------
    models, scores
    """
    scores = []
    models = []
    ps = PredefinedSplit(cv_ids)
    for train_index, test_index in ps.split():
        # split the data
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # fit the model on the training set
        model.fit(X_train, y_train)
        # calculate the accuracy for the hold out run
        score = model.score(X_test, y_test)
        # save stuff
        models.append(deepcopy(model))
        scores.append(score)
    return models, scores
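
Here cv_ids acts as a per-stimulus run label, so PredefinedSplit implements leave-one-run-out cross-validation: each run is held out exactly once. A minimal sketch with synthetic run labels (illustrative, not from the source):

import numpy as np
from sklearn.model_selection import PredefinedSplit

cv_ids = np.repeat([0, 1, 2], 4)  # three runs of four stimuli each
ps = PredefinedSplit(cv_ids)
for train_idx, test_idx in ps.split():
    print("held-out run:", cv_ids[test_idx[0]], "test indices:", test_idx)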
Example no. 4
    def validate(self, cv_splits, num_runs):
        x = pd.concat([self.x_train, self.x_val], axis=0)
        y = pd.concat([self.y_train, self.y_val], axis=0)

        if cv_splits == 1:
            splitter = PredefinedSplit([-1 for _ in range(len(x) - 12)] + [0 for _ in range(12)])
            split = list(splitter.split(X=x, y=y)) * num_runs
        else:
            splitter = TimeSeriesSplit(cv_splits, max_train_size=len(x) - 12)
            split = list(splitter.split(X=x, y=y)) * num_runs

        res = map(self._validate, split)
        res = np.mean(list(res), axis=0)

        # K.clear_session()

        return res[0][0], res[1][0]
Example no. 5
def main(argv):
    start_time = datetime.now()
    logger.info("START")
    args = argparser.parse_args()
    inFile = args.inFile
    testFile = args.testFile
    nameModel = args.nameModel
    conf_file = args.mod
    mod = __import__(conf_file, fromlist=['*'])
    model_conf = mod.gridSearch_Model_types[nameModel]
    conf = getattr(__import__(conf_file, fromlist=[model_conf]), model_conf)
    prefix_dict = conf['prefix_dict']
    out_dict = h.outfileName(fo=args.outFile,
                             fi=inFile,
                             prefix_dict=prefix_dict,
                             add_date=True)
    logger.info("RUNNING WITH MOD: %s, INFILE: %s" % (conf_file, inFile))
    logger.info("LOADING THE DATA SET")
    param_grid = PARAM_DICT[nameModel]
    # scoring = {'Accuracy': make_scorer(accuracy_score),'RMS':make_scorer(mean_squared_error)}
    scoring = {'RMS': make_scorer(r2_score)}
    X, Y, len_train, numFeatures = readFile(inFile)
    cv = None
    if testFile:
        logger.info("USING TEST FILE %s AS TEST SET FOR THE CORSS VALIDATION" %
                    testFile)
        X_test, Y_test, len_train_test, numFeatures_test = readFile(inFile)
        X = pd.concat([X, X_test], ignore_index=True)
        Y = pd.concat([Y, Y_test], ignore_index=True)
        cv_arr = [1] * len_train
        cv_arr.extend([0] * len_train_test)
        cv = PredefinedSplit(test_fold=cv_arr)
        print("Stampa di cv: ", cv)
        print("numero di fold", cv.get_n_splits())
        for train_index, test_index in cv.split():
            print("TRAIN:", train_index, "TEST:", test_index)
        logger.info("SHAPE OF X:%s AND Y:%s AFTER APPEND", X.shape, Y.shape)
    logger.info("CREATION OF THE MODEL")
    t = TestClass(conf=conf, nm=nameModel, nf=numFeatures)
    if nameModel == 'NN':
        model = KerasClassifier(build_fn=t.createModelNN)
        X = X.values
        Y = Y.values
    else:
        model = t.selectModel()
    logger.info("START GRID SEARCH")
    grid_result = gridSearch(model, param_grid, cv, X, Y, scoring)
    logger.info("END OF GRID SEARCH")
    logger.info("PRINTING RESULTS")
    gridResults(grid_result, X, nameModel)
    SaveModel(nameModel, grid_result)
    logger.info("EXECUTED IN %f SEC" %
                ((datetime.now() - start_time)).total_seconds())
    logger.info("END")
Example no. 6
def split_dataset(dataset):
    X = dataset.drop(y_col, axis=1)
    y = dataset[y_col]
    test_fold = (fold_pattern * (
        (dataset.shape[0] - 1) // len(fold_pattern) + 1))[:dataset.shape[0]]
    splitter = PredefinedSplit(test_fold)
    for train_index, test_index in splitter.split():
        X_train, X_test = safe_indexing(X, train_index), safe_indexing(
            X, test_index)
        y_train, y_test = safe_indexing(y, train_index), safe_indexing(
            y, test_index)
    return X_train, y_train, X_test, y_test
Example no. 7
    def split(self, data):
        """Perform a data split with a fixed size for the test set"""
        data_size = 0
        if data.is_row_split_validation():
            #Time series split data by columns
            data_size = data.get_features().shape[1]
        else:
            data_size = data.get_features().shape[0]

        test_fold = [-1 for i in range(0, data_size - self.test_size_)]
        test_fold += [0 for i in range(data_size - self.test_size_, data_size)]
        splitter = PredefinedSplit(test_fold=test_fold)
        return splitter.split()
Example no. 8
def rolling_model_ENetH(df_X, df_Y):
    split_num = 200 * 60
    X_traindata = df_X[:split_num * 2]
    Y_traindata = df_Y[:split_num * 2]
    X_vdata = df_X[split_num:split_num * 2]
    X_testdata = df_X[split_num * 2:split_num * 3]
    Y_testdata = df_Y[split_num * 2:split_num * 3]

    # specify parameters and distributions to sample from

    num_valid_size = len(X_traindata) - len(X_vdata)
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)

    # specify parameters and distributions to sample from
    param_dist = {
        'alpha': uniform(0.00001, 0.1),
        'power_t': uniform(0.1, 0.9),
        'l1_ratio': uniform(0.1, 0.9),
        'eta0': uniform(0.00001, 0.1),
        'epsilon': uniform(0.01, 0.9),
        'max_iter': sp_randint(5, 10000),
        'tol': [0.01, 0.001, 0.0001, 0.00001],
        'fit_intercept': [True, False]
    }

    clf = SGDRegressor(shuffle=False,
                       loss='huber',
                       penalty='elasticnet',
                       random_state=100)

    # run randomized search
    n_iter_search = 100
    estim = RandomizedSearchCV(clf,
                               param_distributions=param_dist,
                               n_iter=n_iter_search,
                               scoring='r2',
                               cv=ps.split(),
                               iid=False,
                               random_state=100,
                               n_jobs=1)

    estim.fit(X_traindata, Y_traindata)
    best_estimator = estim.best_estimator_
    v_pred = best_estimator.predict(df_X[:split_num])
    v_performance_score = r2_score(df_Y[:split_num], v_pred)
    test_pre_y_array = best_estimator.predict(X_testdata)
    test_performance_score = r2_score(Y_testdata, test_pre_y_array)

    return v_performance_score, test_performance_score
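
Examples 1, 8, 13, 17, and 18 all rebuild the same -1/0 holdout mask by hand. A small helper factoring the pattern out (make_holdout_fold is an illustrative name, not from any of these sources):

import numpy as np
from sklearn.model_selection import PredefinedSplit

def make_holdout_fold(n_samples, n_valid):
    """The last n_valid samples form the single validation fold; the rest always train."""
    test_fold = -1 * np.ones(n_samples, dtype=int)
    test_fold[n_samples - n_valid:] = 0
    return PredefinedSplit(test_fold)

ps = make_holdout_fold(10, 3)
train_idx, valid_idx = next(ps.split())
print(train_idx)  # [0 1 2 3 4 5 6]
print(valid_idx)  # [7 8 9]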
Example no. 9
def aggregate_fold_stats(db_paths, cv_pkl_file):
    preprocessed_db = imglmdb.multidbwrapper(sorted(db_paths))
    with open(cv_pkl_file, "rb") as pkl:
        test_fold, nested_test_folds = pickle.load(pkl)

    splitter = PredefinedSplit(test_fold)

    data = [{} for _ in range(splitter.get_n_splits())]  # independent dicts, not aliases of one

    for i, (nested_test_fold,
            (_,
             test_idx)) in enumerate(zip(nested_test_folds, splitter.split())):
        per_pixel_stats = preprocessing.compute_per_pixel_stats(
            preprocessed_db, None, idx=test_idx)
        std_per_pixel = numpy.where(per_pixel_stats[1] == 0.0, 1,
                                    per_pixel_stats[1])
        data[i]["outer"] = (per_pixel_stats[0], std_per_pixel)

        nested_splitter = PredefinedSplit(nested_test_fold)
        data[i]["nested"] = [{}] * nested_splitter.get_n_splits()

        for j, (train_idx, val_idx) in enumerate(nested_splitter.split()):
            per_pixel_stats = preprocessing.compute_per_pixel_stats(
                preprocessed_db, None, idx=train_idx)
            std_per_pixel = numpy.where(per_pixel_stats[1] == 0.0, 1,
                                        per_pixel_stats[1])
            data[i]["nested"][j]["train"] = (per_pixel_stats[0], std_per_pixel)

            per_pixel_stats = preprocessing.compute_per_pixel_stats(
                preprocessed_db, None, idx=val_idx)
            std_per_pixel = numpy.where(per_pixel_stats[1] == 0.0, 1,
                                        per_pixel_stats[1])
            data[i]["nested"][j]["val"] = (per_pixel_stats[0], std_per_pixel)

    with open(os.path.splitext(cv_pkl_file)[0] + "_stats.pkl", "wb") as pkl:
        pickle.dump(data, pkl)

    return data
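
Note the list comprehensions used for data above: the superficially equivalent [{}] * n repeats one dict object n times, so writing to data[i] would write to every slot. A tiny demonstration of the pitfall:

shared = [{}] * 3
shared[0]["k"] = 1
print(shared)       # [{'k': 1}, {'k': 1}, {'k': 1}] -> three aliases of one dict

independent = [{} for _ in range(3)]
independent[0]["k"] = 1
print(independent)  # [{'k': 1}, {}, {}]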
Example no. 10
    def train(self, data, clf='rf', param_search='single', tune_size=0.15,
              scoring='roc_auc', n_jobs=1, verbose=1):
        """Trains a classifier with the specified training data.
        data: tuple including training data.
        clf: string of {'rf', 'lr', 'xgb'}.
        Returns trained classifier."""
        x_train, y_train, _, features = data

        if param_search == 'single' or tune_size == 0:
            model, params = self.classifier(clf, param_search='single')
            model.set_params(**params)

        elif tune_size > 0:
            t1 = self.out('tuning...')
            model, params = self.classifier(clf, param_search=param_search)
            train_len = x_train.shape[0]

            split_ndx = train_len - int(train_len * tune_size)
            sm_x_train, x_val = x_train[:split_ndx], x_train[split_ndx:]
            sm_train_fold = np.full(sm_x_train.shape[0], -1)
            val_fold = np.full(x_val.shape[0], 0)

            predefined_fold = np.append(sm_train_fold, val_fold)
            ps = PredefinedSplit(predefined_fold)
            cv = ps.split(x_train, y_train)
            m = GridSearchCV(model, params, scoring=scoring, cv=cv,
                             verbose=verbose, n_jobs=n_jobs)
            m.fit(x_train, y_train)
            model = m.best_estimator_
            self.time(t1)

        t1 = self.out('training...')

        if clf == 'lgb':
            cat_feat = ['app', 'device', 'os', 'channel', 'hour']
            cat_feat_ndx = [features.index(x) for x in cat_feat]
            train_len = x_train.shape[0]
            split_ndx = train_len - int(train_len * tune_size)
            sm_x_train, x_val = x_train[:split_ndx], x_train[split_ndx:]
            sm_y_train, y_val = y_train[:split_ndx], y_train[split_ndx:]
            eval_set = (x_val, y_val)
            model = model.fit(sm_x_train, sm_y_train, eval_set=eval_set,
                              early_stopping_rounds=50, eval_metric='auc',
                              categorical_feature=cat_feat_ndx)
        else:
            model = model.fit(x_train, y_train)

        self.time(t1)
        self.out(str(model))
        return model
Example no. 11
def train_self(scoring='accuracy'):
    csv_dir = Path("Features/CSV")
    for i in range(10):
        train_test_dir = csv_dir / f"train_test{i}"
        results_dir = Path("Results") / f"self{i}"
        results_dir.mkdir(exist_ok=True)
        for dataset_file in train_test_dir.glob("*test*"):
            dataset = str(dataset_file.stem).split("_test")[0]
            suffixes = ["train", "train_train", "train_val"]
            keys = [f"{s}" for s in suffixes]
            df_dict = {
                key: pd.read_csv(train_test_dir / f"{dataset}_{key}.csv")
                for key in keys
            }

            #xgboost with eval
            ###################################
            data = pd.concat([df_dict["train_train"], df_dict["train_val"]],
                             axis=0)
            data.reset_index(inplace=True, drop=True)
            val_idx = np.concatenate(
                ((-1) * np.ones(df_dict["train_train"].shape[0]),
                 np.zeros(df_dict["train_val"].shape[0])))
            ps = PredefinedSplit(val_idx)
            X = data.drop(columns=["Label", "microRNA_name"])
            y = data.Label.ravel()
            train_index, val_index = next(ps.split())
            X_val = X.iloc[val_index]
            y_val = y[val_index]

            output_file = results_dir / f"{dataset}_xgbs_val_results.csv"
            print(output_file)
            if not output_file.exists():
                clf = XGBClassifier(silent=True)
                grid_obj = GridSearchCV(clf,
                                        XGBS_PARAMS,
                                        scoring=scoring,
                                        cv=ps,
                                        verbose=3)
                fit_params = {
                    "eval_set": [(X_val, y_val)],
                    "early_stopping_rounds": 50
                }
                grid_obj.fit(X, y, **fit_params)

                print('\n Best estimator:')
                print(grid_obj.best_estimator_)
                print(grid_obj.best_score_ * 2 - 1)
                results = pd.DataFrame(grid_obj.cv_results_)
                results.to_csv(output_file, index=False)
Example no. 12
class LeavePOutByGroup():
    def __init__(self, X, p=5, n_splits=2):
        self.X = X
        self.p = p
        self.n_splits = n_splits
        test_fold = self.X.groupby("user_id").cumcount().apply(
            lambda x: int(x / p) if x < (n_splits * p) else -1)
        self.s = PredefinedSplit(test_fold)

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

    def split(self, X=None, y=None, groups=None):
        return self.s.split()
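
A usage sketch for the class above, assuming it is in scope; the toy frame is illustrative. With p=2 and n_splits=2, the first four interactions of each user form two held-out folds and any remaining interactions always train:

import pandas as pd

df = pd.DataFrame({"user_id": [1] * 5 + [2] * 5})
cv = LeavePOutByGroup(df, p=2, n_splits=2)
for train_idx, test_idx in cv.split():
    print(test_idx)
# fold 0: the first two rows of each user -> [0 1 5 6]
# fold 1: the next two rows of each user -> [2 3 7 8]
# the fifth row per user (cumcount >= n_splits * p) is labelled -1 and always trains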
Example no. 13
            def rolling_model_PLS(
                    X_traindata=X_traindata,
                    Y_traindata_demean=np.ravel(Y_traindata_demean),
                    X_traindata1=X_traindata1,
                    Y_traindata1=np.ravel(Y_traindata1),
                    X_testdata=X_testdata,
                    Y_testdata=np.ravel(Y_testdata),
                    mean_Ytrain=mean_Ytrain):

                # specify parameters and distributions to sample from

                split_num = 200 * 60
                num_valid_size = split_num
                test_fold = -1 * np.ones(len(X_traindata))
                test_fold[num_valid_size:] = 0
                ps = PredefinedSplit(test_fold)

                # specify parameters and distributions to sample from
                param_dist = {
                    'n_components': sp_randint(1, 31),
                    'max_iter': sp_randint(50, len(X_traindata)),
                    'tol': [0.0001, 0.00001, 0.000001, 0.0000001]
                }

                PLS_model = PLSRegression(scale=False)

                # run gridsearchcv make_scorer(r2_score)
                n_iter_search = 50
                estim = RandomizedSearchCV(PLS_model,
                                           param_distributions=param_dist,
                                           scoring='r2',
                                           cv=ps.split(),
                                           iid=False,
                                           n_jobs=-1,
                                           n_iter=n_iter_search)

                estim.fit(X_traindata, Y_traindata_demean)
                best_estimator = estim.best_estimator_

                train_predict = best_estimator.predict(
                    X_traindata1) + mean_Ytrain
                IS_score = r2_score(Y_traindata1, train_predict)

                test_predict = best_estimator.predict(X_testdata) + mean_Ytrain
                test_predict = test_predict[:, 0]
                OOS_score = 1 - np.sum(
                    (Y_testdata - test_predict)**2) / np.sum(
                        (Y_testdata - mean_Ytrain)**2)

                return IS_score, OOS_score
Example no. 14
def predefined_train_test_split(data, labels, folds, workflow, label_encoder):
    folds = np.asarray(folds)
    
    fold_encoder = LabelEncoder()
    split_encoded = fold_encoder.fit_transform(folds)
    
    num_classes = len(label_encoder.classes_)
    
    performance = {
        'classes': label_encoder.classes_.tolist(),
        'intervals': {key: np.sum(folds == key) for key in sorted(list(set(folds)))}
    }
    
    split = PredefinedSplit(split_encoded)
    for fold_index, (train_inds, test_inds) in enumerate(split.split()):
        train_x, train_y = [data[ii] for ii in train_inds], [labels[ii] for ii in train_inds]
        test_x, test_y = [data[ii] for ii in test_inds], [labels[ii] for ii in test_inds]
        
        prior_train = [0] * num_classes
        for yy in train_y:
            prior_train[yy] += 1
        
        prior_test = [0] * num_classes
        for yy in test_y:
            prior_test[yy] += 1
        
        clf = deepcopy(workflow)
        clf.fit(train_x, train_y)
        param_dict = {kk: vv.__dict__ for kk, vv in clf.named_steps.items()}
        
        test_pred = clf.predict(test_x)
        
        test_ind = folds[test_inds[0]]
        performance[test_ind] = {
            'accuracy': metrics.accuracy_score(test_y, test_pred),
            'precision_micro': metrics.precision_score(test_y, test_pred, average='micro'),
            'precision_macro': metrics.precision_score(test_y, test_pred, average='macro'),
            'recall_micro': metrics.recall_score(test_y, test_pred, average='micro'),
            'recall_macro': metrics.recall_score(test_y, test_pred, average='macro'),
            'f1_score_micro': metrics.f1_score(test_y, test_pred, average='micro'),
            'f1_score_macro': metrics.f1_score(test_y, test_pred, average='macro'),
            'confusion_matrix': metrics.confusion_matrix(test_y, test_pred).tolist(),
            'prior_train': prior_train,
            'prior_test': prior_test,
            'model': serialise_dict(param_dict)
        }
    
    return serialise_dict(performance)
Example no. 15
def neighbors(train, test, target, cv: PredefinedSplit, k=5, n_trees=10):
    res_train = np.zeros((train.shape[0], 2))
    res_test = np.zeros((test.shape[0], 2))
    for i, (trn_idx, val_idx) in tqdm(enumerate(cv.split(train)),
                                      total=cv.get_n_splits()):
        target_trn = target.iloc[trn_idx]
        X_trn = train.iloc[trn_idx]
        X_val = train.iloc[val_idx]
        n = X_trn[target_trn == 0]
        p = X_trn[target_trn == 1]
        for j, X in enumerate([n, p]):
            u = build(X, n_trees)
            res_train[val_idx, j] = get_feat(X_val, u, k=k)
            res_test[:, j] += get_feat(test, u, k)
    res_test /= cv.get_n_splits()
    return res_train, res_test
Example no. 16
def k_fold_cv(X, y, feature_desc):
    # since fold 4 will be used as a blind set and not part of training, it is removed from fold_ids list.
    fold_ids = pd.read_csv(
        "data/raw_data/CV_fold_ids_trval.csv")['FoldID'][0:132]
    ps = PredefinedSplit(fold_ids)
    fold_id = 0
    y = y[valence_classifier.label_type]
    for train_index, test_index in ps.split():
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = tune_on_devset(X_train, y_train, X_test, y_test)
        joblib.dump(
            clf,
            "data/models/" + feature_desc + "_fold" + str(fold_id) + '.pkl')
        fold_id += 1
    return
Example no. 17
            def rolling_model_RF(X_traindata=X_traindata,
                                 Y_traindata_demean=np.ravel(Y_traindata_demean),
                                 X_traindata1=X_traindata1,
                                 Y_traindata1=np.ravel(Y_traindata1),
                                 X_testdata=X_testdata,
                                 Y_testdata=np.ravel(Y_testdata),
                                 mean_Ytrain=mean_Ytrain):


                # specify parameters and distributions to sample from

                split_num = 200 * 60
                num_valid_size = split_num
                test_fold = -1 * np.ones(len(X_traindata))
                test_fold[num_valid_size:] = 0
                ps = PredefinedSplit(test_fold)

                # specify parameters and distributions to sample from
                param_dist = {"max_features": sp_randint(5, 100),
                              "max_depth": sp_randint(3, 10),
                              "min_samples_split": sp_randint(10, 1000),
                              "min_samples_leaf": sp_randint(10, 1000),
                              "n_estimators": sp_randint(3, 100),
                              "oob_score": [True, False]
                              }

                clf_RF = RandomForestRegressor(random_state=100)

                # run randomized search
                n_iter_search = 50
                estim = RandomizedSearchCV(clf_RF, param_distributions=param_dist,
                                           n_iter=n_iter_search, scoring='r2',n_jobs=-1,
                                           cv=ps.split(), iid=False, random_state=100)

                estim.fit(X_traindata, Y_traindata_demean)
                best_estimator = estim.best_estimator_

                best_VIP = best_estimator.feature_importances_

                train_predict = best_estimator.predict(X_traindata1) + mean_Ytrain
                IS_score = r2_score(Y_traindata1, train_predict)

                test_predict = best_estimator.predict(X_testdata) + mean_Ytrain
                OOS_score = 1 - np.sum((Y_testdata - test_predict)**2) / np.sum((Y_testdata - mean_Ytrain)**2)


                return IS_score, OOS_score, best_VIP
Example no. 18
def rolling_model_GBRTH(df_X, df_Y):
    split_num = 200 * 60
    X_traindata = df_X[:split_num * 2]
    Y_traindata = df_Y[:split_num * 2]
    X_vdata = df_X[split_num:split_num * 2]
    X_testdata = df_X[split_num * 2:split_num * 3]
    Y_testdata = df_Y[split_num * 2:split_num * 3]

    # specify parameters and distributions to sample from

    num_valid_size = len(X_traindata) - len(X_vdata)
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)

    # specify parameters and distributions to sample from
    param_dist = {
        "max_features": sp_randint(5, 100),
        "max_depth": sp_randint(3, 12),
        "min_samples_split": sp_randint(100, 1000),
        "min_samples_leaf": sp_randint(100, 1000),
        "n_estimators": sp_randint(5, 100),
        "learning_rate": uniform(0.001, 0.1),
        "subsample": uniform(0.6, 0.4)
    }

    clf_GBRT = GradientBoostingRegressor(loss='huber', random_state=100)

    # run randomized search
    n_iter_search = 100
    estim = RandomizedSearchCV(clf_GBRT,
                               param_distributions=param_dist,
                               n_iter=n_iter_search,
                               scoring='r2',
                               cv=ps.split(),
                               iid=False,
                               random_state=100)

    estim.fit(X_traindata, Y_traindata)
    best_estimator = estim.best_estimator_
    v_pred = best_estimator.predict(df_X[:split_num])
    v_performance_score = r2_score(df_Y[:split_num], v_pred)
    test_pre_y_array = best_estimator.predict(X_testdata)
    test_performance_score = r2_score(Y_testdata, test_pre_y_array)

    return v_performance_score, test_performance_score
Example no. 19
def test_predefinedsplit_with_kfold_split():
    # Check that PredefinedSplit can reproduce a split generated by Kfold.
    folds = -1 * np.ones(10)
    kf_train = []
    kf_test = []
    for i, (train_ind, test_ind) in enumerate(KFold(5, shuffle=True).split(X)):
        kf_train.append(train_ind)
        kf_test.append(test_ind)
        folds[test_ind] = i
    ps_train = []
    ps_test = []
    ps = PredefinedSplit(folds)
    # n_splits is simply the no of unique folds
    assert_equal(len(np.unique(folds)), ps.get_n_splits())
    for train_ind, test_ind in ps.split():
        ps_train.append(train_ind)
        ps_test.append(test_ind)
    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test)
Example no. 21
def target_encoding(X_train, y_train, X_test, cols, cv_id):
    cols = list(cols)
    train_new = X_train.copy()
    test_new = X_test.copy()
    test_new[:] = 0
    cv = PredefinedSplit(cv_id)
    X_train.index = X_train.index.astype(int)
    for trn_idx, val_idx in tqdm(cv.split(X_train), total=cv.get_n_splits()):
        enc = TargetEncoder(cols=cols)
        enc.fit(X_train.iloc[trn_idx], y_train[trn_idx])
        train_new.iloc[val_idx] = enc.transform(X_train.iloc[val_idx])
        test_new += enc.transform(X_test)
    test_new /= cv.get_n_splits()
    train_new = train_new[cols]
    test_new = test_new[cols]
    train_new.columns = train_new.columns + '_target'
    test_new.columns = test_new.columns + '_target'
    print(list(train_new.columns))
    return train_new, test_new
Example no. 22
    def extract_data(self, df_dict: dict):
        for key in df_dict.keys():
            df_dict[key] = drop_unnamed(df_dict[key])

        data = pd.concat([df_dict["train_train"], df_dict["train_val"]],
                         axis=0)
        data.reset_index(inplace=True, drop=True)
        val_idx = np.concatenate(
            ((-1) * np.ones(df_dict["train_train"].shape[0]),
             np.zeros(df_dict["train_val"].shape[0])))
        ps = PredefinedSplit(val_idx)
        X, y = self.extract_Xy(data)

        train_index, val_index = next(ps.split())
        X_val = X.iloc[val_index]
        y_val = y[val_index]

        X_test, y_test = self.extract_Xy(df_dict["test"])
        assert set(X.columns) == set(
            X_test.columns), f"""X and X_test must have the same columns.
         {set(X.columns).symmetric_difference(X_test.columns)}"""

        return X, y, X_val, y_val, X_test, y_test, ps
Example no. 23
    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = X.shape[0]
        if self.n_splits > n_samples:
            raise ValueError(
                ("Cannot have number of splits n_splits={0} greater"
                 " than the number of samples: n_samples={1}."
                 ).format(self.n_splits, n_samples))

        # generate test fold
        test_fold = np.arange(n_samples, dtype=int) % self.n_splits
        cv = PredefinedSplit(test_fold)

        return cv.split()
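
Note that the generator returned here is single-use: once a caller has iterated it, a second pass yields nothing, so materialize it with list() when the splits must be reused (as Example 26 below does). A minimal sketch:

from sklearn.model_selection import PredefinedSplit

ps = PredefinedSplit([-1, -1, 0, 0])
gen = ps.split()
print(list(gen))  # one (train, test) pair
print(list(gen))  # [] -> already exhausted

splits = list(ps.split())  # materialize once to iterate as often as needed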
Example no. 24
lgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'colsample_bytree': 0.8,
    'subsample': 0.9,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'min_split_gain': 0.01,
    'min_child_weight': 2,
    'random_state': 77
}

print('5-fold CV')
score = cross_validate(lgb.LGBMClassifier(**lgb_params), X_train, y_train, cv=cv.split(X_train, y_train),
                       scoring='roc_auc', n_jobs=4, verbose=4)

valid_score = score['test_score'].mean()
print('val:', valid_score)

print('train')
model = lgb.LGBMClassifier(**lgb_params)
model.fit(X_train, y_train)

print(f'val = {valid_score};\nfeats = {feats};\nlgb_params = {lgb_params}')
generate_submit(model.predict_proba(X_test)[:, 1], f'{NAME}_{valid_score:.4f}')

print('output feature importances')
feat_df = pd.DataFrame({'importance': model.feature_importances_}, index=X_train.columns).sort_values('importance')
feat_df[-50:].plot.barh(figsize=(20, 15))
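
The excerpt above assumes a cv splitter defined earlier. One plausible reconstruction, consistent with the '5-fold CV' banner and this page's theme, assigns round-robin fold labels via PredefinedSplit (the modulo scheme is an assumption, mirroring Example 23 above):

import numpy as np
from sklearn.model_selection import PredefinedSplit

# assumed: five round-robin folds over the training rows
cv = PredefinedSplit(np.arange(len(X_train)) % 5)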
Example no. 25
result = bayes_cv_tuner.fit(X.values, y.values, callback=status_print)

# ## Example 3: Different cross-validators
# Some people have asked about the CV strategy; so far I have just used basic stratified K-fold, mostly due to time constraints rather than deliberate choice. There are potentially better options, especially given the temporal nature of this problem, and they are easy to add with scikit-learn cross-validators: you just plug a new cross-validator into the `cv = ` option of BayesSearchCV. One example is a single train-test split, where we e.g. use one day for training and one for testing (adjust accordingly):

# In[6]:

from sklearn.model_selection import PredefinedSplit

# Training rows [index == -1], testing rows [index == 0]
test_fold = np.zeros(len(X))
test_fold[:(TRAINING_SIZE - TEST_SIZE)] = -1
cv = PredefinedSplit(test_fold)

# Check that we only have a single train-test split, and the size
train_idx, test_idx = next(cv.split())
print(
    f"Splits: {cv.get_n_splits()}, Train size: {len(train_idx)}, Test size: {len(test_idx)}"
)

# Alternatively, we could want to use the [TimeSeriesSplit](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html#sklearn.model_selection.TimeSeriesSplit) cross-validator, which allows us to do several "into the future folds" for predictions

# In[14]:

from sklearn.model_selection import TimeSeriesSplit

# Here we just do 3-fold timeseries CV
cv = TimeSeriesSplit(max_train_size=None, n_splits=3)

# Let us check the sizes of the folds. Note that you can keep train size constant with max_train_size if needed
for i, (train_index, test_index) in enumerate(cv.split(X)):
Example no. 26
    ps_clf = PredefinedSplit(test_fold=[
        0 if i in val_idx else -1 for i in sorted(train_or_val_idx)
    ])
    ps_reg = [
        PredefinedSplit(test_fold=[
            0 if k in val_idx else -1 for k in sorted(
                set(idx_by_class[c]).intersection(set(train_or_val_idx)))
        ]) for c in classes
    ]

    # Construct grid search cross validation to select the best classifier given the validation set
    clf = GridSearchCV(LinearSVC(max_iter=1e9),
                       parameters_clf,
                       scoring='accuracy',
                       refit=True,
                       cv=list(ps_clf.split()))

    # Construct grid search cross validation to select the best regressors given the validation set
    reg = [
        GridSearchCV(LinearSVR(loss='squared_epsilon_insensitive',
                               max_iter=1e9),
                     parameters_reg,
                     cv=list(ps_reg[i].split()),
                     scoring=make_scorer(EVAL_SCORE, greater_is_better=False),
                     n_jobs=4,
                     refit=True) for i, _ in enumerate(classes)
    ]

    # Train the classifier model
    clf.fit(X_tv, y_tv)
    #clf.score(X_tv, y_tv)
Example no. 27
def ada(X, Y, kfold=3, feature_set=None):

    arr = index_splitter(N=len(X), fold=kfold)
    ps = PredefinedSplit(arr)

    for train, test in ps.split():
        train_index = train
        test_index = test

    train_X, train_y = X.values[train_index, :], Y.values[train_index]
    test_X, test_y = X.values[test_index, :], Y.values[test_index]
    arr = index_splitter(N=len(train_X), fold=kfold)
    ps2 = PredefinedSplit(arr)

    learning_rate = [x for x in np.linspace(0.1, 1, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=20, stop=1000, num=100)]
    loss = ['square']

    random_grid = {
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'loss': loss
    }

    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    ada = AdaBoostRegressor(random_state=42, loss='square')

    # Look at parameters used by our current forest
    print('Parameters for baseline:\n')
    pprint(ada.get_params())

    # Random search of parameters, using 3 fold cross validation,
    # search across 100 different combinations, and use all available cores
    ada_random = RandomizedSearchCV(estimator=ada,
                                    n_iter=200,
                                    param_distributions=random_grid,
                                    scoring='neg_mean_squared_error',
                                    cv=ps2.split(),
                                    verbose=2,
                                    random_state=42,
                                    n_jobs=-1)

    # Fit the random search model
    ada_random.fit(train_X, train_y)
    pprint(ada_random.best_params_)

    cv_result_rd = ada_random.cv_results_
    BestPara_random = ada_random.best_params_

    ## Grid search of parameters, using 3 fold cross validation based on Random search
    lr = [BestPara_random['learning_rate']]
    #n_estimators = [BestPara_random["n_estimators"]]

    n_estimators = [
        int(x) for x in range(BestPara_random["n_estimators"] -
                              10, BestPara_random["n_estimators"] + 10, 20)
    ]
    n_estimators = [item for item in n_estimators if item > 0]

    grid_grid = {
        'n_estimators': n_estimators,
        'learning_rate': lr,
        'loss': loss
    }

    ada_grid = GridSearchCV(estimator=ada,
                            param_grid=grid_grid,
                            scoring='neg_mean_squared_error',
                            cv=ps2.split(),
                            verbose=2,
                            n_jobs=-1)

    # Fit the grid search model
    ada_grid.fit(train_X, train_y)
    BestPara_grid = ada_grid.best_params_

    pprint(ada_grid.best_params_)
    cv_results_grid = ada_grid.cv_results_

    # Fit the base line search model
    ada.fit(train_X, train_y)

    #prediction
    predict_y = ada_random.predict(test_X)
    predict_y_grid = ada_grid.predict(test_X)
    predict_y_base = ada.predict(test_X)

    # Performance metrics

    def RMLSE(predict_y_grid, predict_y, predict_y_base, test_y):
        errors_Grid_CV = np.sqrt(mean_squared_log_error(
            predict_y_grid, test_y))
        errors_Random_CV = np.sqrt(mean_squared_log_error(predict_y, test_y))
        errors_baseline = np.sqrt(
            mean_squared_log_error(predict_y_base, test_y))
        return errors_Grid_CV, errors_Random_CV, errors_baseline

    errors_Grid_CV = (mean_squared_error(predict_y_grid,
                                         test_y))  #,squared = False))
    errors_Random_CV = (mean_squared_error(predict_y,
                                           test_y))  #,squared = False))
    errors_baseline = (mean_squared_error(predict_y_base,
                                          test_y))  #,squared = False))

    x_axis = range(3)
    results = [errors_Grid_CV, errors_Random_CV, errors_baseline]

    print('AdaBoost results:', results)

    if True:
        fig = plt.figure(figsize=(15, 8))
        x_axis = range(3)
        plt.bar(x_axis, results)
        plt.xticks(x_axis, ('GridSearchCV', 'RandomizedSearchCV', 'Baseline'))
        #plt.show()
        plt.savefig('ada_compare_error.png')

        #feature importance
        num_feature = len(ada_grid.best_estimator_.feature_importances_)
        plt.figure(figsize=(24, 6))
        plt.bar(range(0, num_feature * 4, 4),
                ada_grid.best_estimator_.feature_importances_)

        label_name = X.keys()

        plt.xticks(range(0, num_feature * 4, 4), label_name)
        plt.title("Feature Importances" + ",kfold=" + str(kfold))
        #plt.show()
        plt.savefig('ada_feature_importance.png')

        fig = plt.figure(figsize=(20, 8))
        ax = fig.gca()
        x_label = range(0, len(predict_y_grid))
        plt.title("kfold=" + str(kfold))
        ax.plot(x_label, predict_y_grid, 'r--', label="predict")
        ax.plot(x_label, test_y, label="ground_truth")
        ax.set_ylim(0, 200)
        ax.legend()
        #plt.show()
        plt.savefig('ada_prediction.png')

        #return a dictionary for all results
    return ada_grid.predict, ada_grid.best_estimator_
Example no. 28

########search for parameters using own scoring function
def my_own_scorer(clf, X, y_true):
    class_labels = clf.classes_
    loss = log_loss(y_true, clf.predict_proba(X), labels=class_labels)
    return -loss  # GridSearchCV maximizes the score, so return the negated loss

Cs = [0.1,0.3,0.5,1,3,5]
tols = [0.01,0.03]


gs = GridSearchCV(
        estimator=LogisticRegression(random_state=0), ######machine learning algorithm
        param_grid={'C': Cs,'tol':tols},#####list of parameters to search for
        cv=ps.split(), #######evaluate performance on training and validation set; in this case, we are using a predefined training and validation set
        verbose=True,
        scoring=my_own_scorer######evaluate the performance using this scoring function
        )

model = gs.fit(X,y)

print(pd.DataFrame(model.cv_results_) )#####results are available in here



########search for parameters using sklearn predefined scorer
Cs = [0.1,0.3,0.5,1,3,5]
tols = [0.01,0.03]
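
The excerpt stops before the predefined-scorer search it announces. A hedged completion (assumed, since the original continuation is not shown) using sklearn's built-in 'neg_log_loss' scorer, which encodes the same negate-the-loss convention as the custom scorer above:

gs = GridSearchCV(
        estimator=LogisticRegression(random_state=0),
        param_grid={'C': Cs, 'tol': tols},
        cv=ps.split(),  # a fresh generator; the earlier search consumed its own
        verbose=True,
        scoring='neg_log_loss'  # built-in scorer: higher (less negative) is better
        )

model = gs.fit(X, y)
print(pd.DataFrame(model.cv_results_))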

Example no. 29
def get_dataloaders(dataset, batch, dataroot, split=0.15, split_idx=0, multinode=False, target_lb=-1, gr_assign=None, gr_id=None, gr_ids=None, rand_val=False):
    if 'cifar' in dataset or 'svhn' in dataset:
        if "cifar" in dataset:
            _mean, _std = _CIFAR_MEAN, _CIFAR_STD
        else:
            _mean, _std = _SVHN_MEAN, _SVHN_STD
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(_mean, _std),
        ])
        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(_mean, _std),
        ])
    elif 'imagenet' in dataset:
        input_size = 224
        sized_size = 256

        if 'efficientnet' in C.get()['model']['type']:
            input_size = EfficientNet.get_image_size(C.get()['model']['type'])
            sized_size = input_size + 32    # TODO
            # sized_size = int(round(input_size / 224. * 256))
            # sized_size = input_size
            logger.info('size changed to %d/%d.' % (input_size, sized_size))

        transform_train = transforms.Compose([
            EfficientNetRandomCrop(input_size),
            transforms.Resize((input_size, input_size), interpolation=Image.BICUBIC),
            # transforms.RandomResizedCrop(input_size, scale=(0.1, 1.0), interpolation=Image.BICUBIC),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(
                brightness=0.4,
                contrast=0.4,
                saturation=0.4,
            ),
            transforms.ToTensor(),
            Lighting(0.1, _IMAGENET_PCA['eigval'], _IMAGENET_PCA['eigvec']),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        transform_test = transforms.Compose([
            EfficientNetCenterCrop(input_size),
            transforms.Resize((input_size, input_size), interpolation=Image.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    else:
        raise ValueError('dataset=%s' % dataset)

    if isinstance(C.get()['aug'], list):
        logger.debug('augmentation provided.')
        transform_train.transforms.insert(0, Augmentation(C.get()['aug']))
    elif isinstance(C.get()['aug'], dict):
        # group version
        logger.debug('group augmentation provided.')
    else:
        logger.debug('augmentation: %s' % C.get()['aug'])
        if C.get()['aug'] == 'fa_reduced_cifar10':
            transform_train.transforms.insert(0, Augmentation(fa_reduced_cifar10()))

        elif C.get()['aug'] == 'fa_reduced_imagenet':
            transform_train.transforms.insert(0, Augmentation(fa_resnet50_rimagenet()))

        elif C.get()['aug'] == 'fa_reduced_svhn':
            transform_train.transforms.insert(0, Augmentation(fa_reduced_svhn()))

        elif C.get()['aug'] == 'arsaug':
            transform_train.transforms.insert(0, Augmentation(arsaug_policy()))
        elif C.get()['aug'] == 'autoaug_cifar10':
            transform_train.transforms.insert(0, Augmentation(autoaug_paper_cifar10()))
        elif C.get()['aug'] == 'autoaug_extend':
            transform_train.transforms.insert(0, Augmentation(autoaug_policy()))
        elif C.get()['aug'] in ['default', "clean", "nonorm", "nocut"]:
            pass
        else:
            raise ValueError('not found augmentations. %s' % C.get()['aug'])

    if C.get()['cutout'] > 0 and C.get()['aug'] != "nocut":
        transform_train.transforms.append(CutoutDefault(C.get()['cutout']))
    if C.get()['aug'] == "clean":
        transform_train = transform_test
    elif C.get()['aug'] == "nonorm":
        transform_train = transforms.Compose([
            transforms.ToTensor()
        ])
    train_idx = valid_idx = None
    if dataset == 'cifar10':
        if isinstance(C.get()['aug'], dict):
            total_trainset = GrAugCIFAR10(root=dataroot, gr_assign=gr_assign, gr_policies=C.get()['aug'], train=True, download=False, transform=transform_train)
        else:
            total_trainset = torchvision.datasets.CIFAR10(root=dataroot, train=True, download=False, transform=transform_train)
        testset = torchvision.datasets.CIFAR10(root=dataroot, train=False, download=False, transform=transform_test)
    elif dataset == 'reduced_cifar10':
        if isinstance(C.get()['aug'], dict):
            total_trainset = GrAugCIFAR10(root=dataroot, gr_assign=gr_assign, gr_policies=C.get()['aug'], train=True, download=False, transform=transform_train)
        else:
            total_trainset = torchvision.datasets.CIFAR10(root=dataroot, train=True, download=False, transform=transform_train)
        sss = StratifiedShuffleSplit(n_splits=5, train_size=4000, random_state=0)   # 4000 trainset
        sss = sss.split(list(range(len(total_trainset))), total_trainset.targets)
        for _ in range(split_idx+1):
            train_idx, valid_idx = next(sss)

        testset = torchvision.datasets.CIFAR10(root=dataroot, train=False, download=False, transform=transform_test)
    elif dataset == 'cifar100':
        if isinstance(C.get()['aug'], dict):
            total_trainset = GrAugData("CIFAR100", root=dataroot, gr_assign=gr_assign, gr_policies=C.get()['aug'], train=True, download=False, transform=transform_train)
        else:
            total_trainset = torchvision.datasets.CIFAR100(root=dataroot, train=True, download=False, transform=transform_train)
        testset = torchvision.datasets.CIFAR100(root=dataroot, train=False, download=False, transform=transform_test)
    elif dataset == 'svhn': #TODO
        trainset = torchvision.datasets.SVHN(root=dataroot, split='train', download=False, transform=transform_train)
        extraset = torchvision.datasets.SVHN(root=dataroot, split='extra', download=False, transform=transform_train)
        total_trainset = ConcatDataset([trainset, extraset])
        testset = torchvision.datasets.SVHN(root=dataroot, split='test', download=False, transform=transform_test)
    elif dataset == 'reduced_svhn':
        if isinstance(C.get()['aug'], dict):
            total_trainset = GrAugData("SVHN", root=dataroot, gr_assign=gr_assign, gr_policies=C.get()['aug'], split='train', download=False, transform=transform_train)
        else:
            total_trainset = torchvision.datasets.SVHN(root=dataroot, split='train', download=False, transform=transform_train)
        sss = StratifiedShuffleSplit(n_splits=5, train_size=1000, test_size=7325, random_state=0)
        sss = sss.split(list(range(len(total_trainset))), total_trainset.labels)
        for _ in range(split_idx+1):
            train_idx, valid_idx = next(sss)
        # targets = [total_trainset.labels[idx] for idx in train_idx]
        # total_trainset = Subset(total_trainset, train_idx)
        # total_trainset.targets = targets

        testset = torchvision.datasets.SVHN(root=dataroot, split='test', download=False, transform=transform_test)
    elif dataset == 'imagenet':
        total_trainset = ImageNet(root=os.path.join(dataroot, 'imagenet-pytorch'), transform=transform_train)
        testset = ImageNet(root=os.path.join(dataroot, 'imagenet-pytorch'), split='val', transform=transform_test)

        # compatibility
        total_trainset.targets = [lb for _, lb in total_trainset.samples]
    elif dataset == 'reduced_imagenet':
        # randomly chosen indices
        # idx120 = sorted(random.sample(list(range(1000)), k=120))
        idx120 = [16, 23, 52, 57, 76, 93, 95, 96, 99, 121, 122, 128, 148, 172, 181, 189, 202, 210, 232, 238, 257, 258, 259, 277, 283, 289, 295, 304, 307, 318, 322, 331, 337, 338, 345, 350, 361, 375, 376, 381, 388, 399, 401, 408, 424, 431, 432, 440, 447, 462, 464, 472, 483, 497, 506, 512, 530, 541, 553, 554, 557, 564, 570, 584, 612, 614, 619, 626, 631, 632, 650, 657, 658, 660, 674, 675, 680, 682, 691, 695, 699, 711, 734, 736, 741, 754, 757, 764, 769, 770, 780, 781, 787, 797, 799, 811, 822, 829, 830, 835, 837, 842, 843, 845, 873, 883, 897, 900, 902, 905, 913, 920, 925, 937, 938, 940, 941, 944, 949, 959]
        total_trainset = ImageNet(root=os.path.join(dataroot, 'imagenet-pytorch'), transform=transform_train)
        testset = ImageNet(root=os.path.join(dataroot, 'imagenet-pytorch'), split='val', transform=transform_test)

        # compatibility
        total_trainset.targets = [lb for _, lb in total_trainset.samples]

        sss = StratifiedShuffleSplit(n_splits=1, test_size=len(total_trainset) - 50000, random_state=0)  # keep 50000 samples for training
        sss = sss.split(list(range(len(total_trainset))), total_trainset.targets)
        train_idx, valid_idx = next(sss)

        # filter out
        train_idx = list(filter(lambda x: total_trainset.labels[x] in idx120, train_idx))
        valid_idx = list(filter(lambda x: total_trainset.labels[x] in idx120, valid_idx))
        test_idx = list(filter(lambda x: testset.samples[x][1] in idx120, range(len(testset))))

        targets = [idx120.index(total_trainset.targets[idx]) for idx in train_idx]
        for idx in range(len(total_trainset.samples)):
            if total_trainset.samples[idx][1] not in idx120:
                continue
            total_trainset.samples[idx] = (total_trainset.samples[idx][0], idx120.index(total_trainset.samples[idx][1]))
        total_trainset = Subset(total_trainset, train_idx)
        total_trainset.targets = targets

        for idx in range(len(testset.samples)):
            if testset.samples[idx][1] not in idx120:
                continue
            testset.samples[idx] = (testset.samples[idx][0], idx120.index(testset.samples[idx][1]))
        testset = Subset(testset, test_idx)
        print('reduced_imagenet train=', len(total_trainset))
    elif dataset == "cifar10_svhn":
        if isinstance(C.get()['aug'], dict):
            # last stage: benchmark test
            total_trainset = GrAugMix(dataset.split("_"), gr_assign=gr_assign, gr_policies=C.get()['aug'], root=dataroot, train=True, download=False, transform=transform_train, gr_ids=gr_ids)
        else:
            # eval_tta & childnet training
            total_trainset = GrAugMix(dataset.split("_"), root=dataroot, train=True, download=False, transform=transform_train)
        testset = GrAugMix(dataset.split("_"), root=dataroot, train=False, download=False, transform=transform_test)
    else:
        raise ValueError('invalid dataset name=%s' % dataset)

    if not hasattr(total_trainset, "gr_ids"):
        total_trainset.gr_ids = None
    if gr_ids is not None:
        total_trainset.gr_ids = gr_ids
    if gr_assign is not None and total_trainset.gr_ids is None:
        # eval_tta3
        temp_trainset = copy.deepcopy(total_trainset)
        # temp_trainset.transform = transform_test # just normalize
        temp_loader = torch.utils.data.DataLoader(
            temp_trainset, batch_size=batch, shuffle=False, num_workers=4,
            drop_last=False)
        gr_dist = gr_assign(temp_loader)
        gr_ids = torch.max(gr_dist, 1)[1].numpy()  # argmax over the group dimension

    if split > 0.0:
        if train_idx is None or valid_idx is None:
            # filter by split ratio
            sss = StratifiedShuffleSplit(n_splits=5, test_size=split, random_state=0)
            sss = sss.split(list(range(len(total_trainset))), total_trainset.targets)
            for _ in range(split_idx + 1):
                train_idx, valid_idx = next(sss)

        if gr_id is not None:
            # filter by group
            idx2gr = total_trainset.gr_ids
            ps = PredefinedSplit(idx2gr)
            ps = ps.split()
            for _ in range(gr_id + 1):
                _, gr_split_idx = next(ps)
            train_idx = [idx for idx in train_idx if idx in gr_split_idx]
            valid_idx = [idx for idx in valid_idx if idx in gr_split_idx]

        if target_lb >= 0:
            train_idx = [i for i in train_idx if total_trainset.targets[i] == target_lb]
            valid_idx = [i for i in valid_idx if total_trainset.targets[i] == target_lb]

        train_sampler = SubsetRandomSampler(train_idx)
        valid_sampler = SubsetSampler(valid_idx) if not rand_val else SubsetRandomSampler(valid_idx)

        if multinode:
            train_sampler = torch.utils.data.distributed.DistributedSampler(Subset(total_trainset, train_idx), num_replicas=dist.get_world_size(), rank=dist.get_rank())
    else:
        train_sampler = None
        valid_sampler = SubsetSampler([])

        if gr_id is not None:
            # filter by group
            idx2gr = total_trainset.gr_ids
            ps = PredefinedSplit(idx2gr)
            ps = ps.split()
            for _ in range(gr_id + 1):
                _, gr_split_idx = next(ps)
            targets = [total_trainset.targets[idx] for idx in gr_split_idx]
            total_trainset = Subset(total_trainset, gr_split_idx)
            total_trainset.targets = targets

        if train_idx is not None and valid_idx is not None:
            if dataset in ["svhn", "reduced_svhn"]:
                targets = [total_trainset.labels[idx] for idx in train_idx]
            else:
                targets = [total_trainset.targets[idx] for idx in train_idx]
            total_trainset = Subset(total_trainset, train_idx)
            total_trainset.targets = targets

        if multinode:
            train_sampler = torch.utils.data.distributed.DistributedSampler(total_trainset, num_replicas=dist.get_world_size(), rank=dist.get_rank())
            logger.info(f'----- dataset with DistributedSampler  {dist.get_rank()}/{dist.get_world_size()}')


    trainloader = torch.utils.data.DataLoader(
        total_trainset, batch_size=batch, shuffle=True if train_sampler is None else False, num_workers=8 if torch.cuda.device_count()==8 else 4, pin_memory=True,
        sampler=train_sampler, drop_last=True)
    validloader = torch.utils.data.DataLoader(
        total_trainset, batch_size=batch, shuffle=False, num_workers=4, pin_memory=True,
        sampler=valid_sampler, drop_last=False if not rand_val else True)
    testloader = torch.utils.data.DataLoader(
        testset, batch_size=batch, shuffle=False, num_workers=8 if torch.cuda.device_count()==8 else 4, pin_memory=True,
        drop_last=False
    )
    return train_sampler, trainloader, validloader, testloader
Example no. 30
print('y')
print(y)
print()
print()
print('------------------------------')

########manually splitting into training and validation set
test_fold = [-1] * 6 + [0] * 4
#-1 indicates that index will belong to the training set
#in this case, the first 6 records will belong to the training set
#others will belong to validation set
ps = PredefinedSplit(test_fold)

#########splitted
print('splitted training and validation set')
for train_index, test_index in ps.split():
    print('train_index', train_index)
    print('X_train (first 6 records of X)')
    print(X[train_index])
    print('y_train (first 6 records of y)')
    print(y[train_index])

    print()
    print()

    print('test_index', test_index)
    print('X_test (last 4 records of X)')
    print(X[test_index])
    print('y_test (last 4 records of y)')
    print(y[test_index])
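
One detail the demo above glosses over: PredefinedSplit.split() ignores any X, y, or groups passed to it, so fold membership comes entirely from test_fold. A quick check:

import numpy as np
from sklearn.model_selection import PredefinedSplit

ps = PredefinedSplit([-1] * 6 + [0] * 4)
a = list(ps.split())
b = list(ps.split(np.zeros((10, 3)), np.zeros(10)))  # arguments are ignored
assert all((t1 == t2).all() and (v1 == v2).all()
           for (t1, v1), (t2, v2) in zip(a, b))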
Example no. 31
                                errors='ignore')
    df = df.set_index(pd.DatetimeIndex(df['time']))
    X = df.drop(columns=[
        df.keys()[0], 'sbi', 'bemp', 'time', 'sbi_1h', 'sbi_2h', 'sbi_3h',
        'sbi_4h', 'sbi_5h', 'sbi_6h', 'sbi_7h', 'sbi_8h', 'sbi_9h', 'sbi_10h',
        'sbi_11h', 'sbi_12h', 'sbi_1d', 'sbi_2d', 'sbi_3d', 'sbi_4d', 'sbi_5d',
        'sbi_6d', 'sbi_7d', 'y_sbi'
    ])
    Y = df['bemp']
    print(X.columns)

    # Data Splitter
    arr = index_splitter(N=len(X), fold=3)
    ps = PredefinedSplit(arr)

    for train, test in ps.split():
        train_index = train
        test_index = test

    train_X, train_y = X.iloc[train_index, :], Y.iloc[train_index]
    test_X, test_y = X.iloc[test_index, :], Y.iloc[test_index]

    if True:
        regressor = LinearRegression()
        regressor.fit(train_X, train_y)
        pickle.dump(regressor, open('bemp_model.pkl', 'wb'))

    model = pickle.load(open('bemp_model.pkl', 'rb'))

    print(model.predict(test_X))
Example no. 32
def lasso(X, Y, kfold=3, feature_set=None):
    arr = index_splitter(N=len(X), fold=kfold)
    ps = PredefinedSplit(arr)

    for train, test in ps.split():
        train_index = train
        test_index = test

    train_X, train_y = X.values[train_index, :], Y.values[train_index]
    test_X, test_y = X.values[test_index, :], Y.values[test_index]
    arr = index_splitter(N=len(train_X), fold=kfold)
    ps2 = PredefinedSplit(arr)

    # Create the random grid
    alpha = np.linspace(0, 1, 10)
    random_grid = {'alpha': alpha}

    lasso = Lasso(random_state=42)

    # Look at parameters used by our current forest
    print('Parameters currently in use:\n')
    pprint(lasso.get_params())

    # Use the random grid to search for best hyperparameters
    # First create the base model to tune

    # Random search of parameters, using 3 fold cross validation,
    # search across 100 different combinations, and use all available cores
    lasso_random = RandomizedSearchCV(estimator=lasso,
                                      param_distributions=random_grid,
                                      scoring='neg_mean_squared_error',
                                      cv=ps2.split(),
                                      verbose=2,
                                      random_state=42,
                                      n_jobs=-1)

    # Fit the random search model
    lasso_random.fit(train_X, train_y)
    pprint(lasso_random.best_params_)

    cv_result_rd = lasso_random.cv_results_

    BestPara_random = lasso_random.best_params_

    ## Grid search of parameters, using 3 fold cross validation based on Random search
    from sklearn.model_selection import GridSearchCV

    # Number of trees in random forest
    alpha = np.linspace(BestPara_random["alpha"] - 0.2,
                        BestPara_random["alpha"] + 0.2, 10)

    # Create the random grid
    grid_grid = {'alpha': alpha}

    lasso_grid = GridSearchCV(estimator=lasso,
                              param_grid=grid_grid,
                              scoring='neg_mean_squared_error',
                              cv=ps2.split(),
                              verbose=2,
                              n_jobs=-1)
    # Fit the grid search model
    lasso_grid.fit(train_X, train_y)
    BestPara_grid = lasso_grid.best_params_

    pprint(lasso_grid.best_params_)
    cv_results_grid = lasso_grid.cv_results_

    # Fit the base line search model
    lasso.fit(train_X, train_y)

    #prediction
    predict_y = lasso_random.predict(test_X)
    predict_y_grid = lasso_grid.predict(test_X)
    predict_y_base = lasso.predict(test_X)

    def RMLSE(predict_y_grid, predict_y, predict_y_base, test_y):
        errors_Grid_CV = np.sqrt(mean_squared_log_error(
            predict_y_grid, test_y))
        errors_Random_CV = np.sqrt(mean_squared_log_error(predict_y, test_y))
        errors_baseline = np.sqrt(
            mean_squared_log_error(predict_y_base, test_y))
        return errors_Grid_CV, errors_Random_CV, errors_baseline

    errors_Grid_CV = (mean_squared_error(predict_y_grid,
                                         test_y))  #,squared = False))
    errors_Random_CV = (mean_squared_error(predict_y,
                                           test_y))  #,squared = False))
    errors_baseline = (mean_squared_error(predict_y_base,
                                          test_y))  #,squared = False))
    results = [errors_Grid_CV, errors_Random_CV, errors_baseline]

    print('lasso results:', results)

    if True:

        fig = plt.figure(figsize=(20, 8))
        x_axis = range(3)
        plt.bar(x_axis, results)
        plt.xticks(x_axis, ('GridSearchCV', 'RandomizedSearchCV', 'Baseline'))
        #plt.show()
        plt.savefig('lasso_error_compare.png')

        #feature importance
        #num_feature = len(lasso.best_estimator_.feature_importances_)
        #plt.figure(figsize=(24,6))
        #plt.bar(range(0,num_feature*4,4),lasso.best_estimator_.feature_importances_)
        #label_name = X.keys()
        #plt.xticks(range(0,num_feature*4,4), label_name)
        #plt.title("Feature Importances"+",kfold="+str(kfold))
        #plt.show()
        #plt.savefig('lasso_feature_importance.png')

        fig = plt.figure(figsize=(20, 8))
        ax = fig.gca()
        x_label = range(0, len(predict_y_grid))
        plt.title("kfold=" + str(kfold))
        ax.plot(x_label, predict_y_grid, 'r--', label="predict")
        ax.plot(x_label, test_y, label="ground_truth")
        ax.set_ylim(0, 200)
        ax.legend()
        #plt.show()
        plt.savefig('lasso_prediction.png')

    return lasso_grid.predict, lasso_grid.best_estimator_