Example #1
def test_jdata_fscorer_class():
    monkey_patch.run()

    user_sku_pair, data, target, pred_proba, test_fold, expected_scores = (
        get_jdata_test_cases())

    pred_map = {}
    clf = MockEstimatorWithPredefinedPrediction(pred_map)
    ps = PredefinedSplit(test_fold)

    for train_index, test_index in ps.split():
        print("TRAIN:", train_index, "TEST:", test_index)
        # Register the predefined predictions for both halves of the split.
        clf.set(data[train_index, :], pred_proba[train_index])
        clf.set(data[test_index, :], pred_proba[test_index])

    scoring = {
        "custom_score_index": JDataScore(),
        "custom_score_index_with_user_sku_pair": JDataScore(user_sku_pair),
    }
    scores = cross_validate(clf,
                            data,
                            target,
                            scoring=scoring,
                            cv=ps,
                            return_estimator=True)

    for name in scoring.keys():
        assert_almost_equal(scores[f"test_{name}"], expected_scores)
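Note: a minimal, self-contained sketch of how PredefinedSplit interprets test_fold (per scikit-learn's documented behavior: entries of -1 are never placed in a test set, and equal non-negative entries form one test fold):

import numpy as np
from sklearn.model_selection import PredefinedSplit

# Rows labelled -1 stay in training for every split; 0 and 1 form two test folds.
test_fold = np.array([-1, -1, 0, 0, 1, 1])
ps = PredefinedSplit(test_fold)
print(ps.get_n_splits())  # 2
for train_idx, test_idx in ps.split():
    print("TRAIN:", train_idx, "TEST:", test_idx)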
Example #2
def nested_cv(X, y, estimator, scorer, param_grid, num_trials=10, n_splits=3, n_high=5, random_state=42*31415):

    groups = group_samples_by_threshold(y, [1e3, 1e5])

    # Data Storage for CV Scores
    cv_scores = []

    # Arrays to store scores
    nested_scores = np.full(num_trials, -np.inf)
    # Best regression model (return value)
    rg_best = None

    for i in tqdm(range(num_trials)):
        seed = i * random_state

        inner_cv = PredefinedSplit(split_keep_n_high_grouped(y, groups, folds=n_splits, n_high=n_high, random_state=seed))
        outer_cv = PredefinedSplit(split_keep_n_high_grouped(y, groups, folds=n_splits, n_high=n_high, random_state=seed))

        # Non_nested parameter search and scoring
        rg = GridSearchCV(estimator=estimator, param_grid=param_grid,
                          cv=inner_cv, scoring=scorer, return_train_score=True)
        rg.fit(X, y)

        # Nested CV with parameter optimization
        nested_score = cross_val_score(rg.best_estimator_, X=X, y=y, cv=outer_cv, scoring=scorer)

        nested_scores[i] = nested_score.mean()
        if nested_scores.max() == nested_scores[i]:
            rg_best = rg.best_estimator_

        cv_scores.append({
            'gs_scores': pd.DataFrame(rg.cv_results_).sort_values(
                'mean_test_score')[['params', 'mean_test_score']],
            'ns_scores': nested_score,
        })

    return rg_best, cv_scores
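The helpers group_samples_by_threshold and split_keep_n_high_grouped above are project-specific, so the snippet does not run on its own. For comparison, a minimal self-contained sketch of the standard nested-CV pattern with stock splitters; passing the unfitted GridSearchCV straight to cross_val_score re-tunes inside every outer fold, which avoids the selection leak that scoring a single best_estimator_ can introduce:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

X, y = make_regression(n_samples=120, n_features=8, random_state=0)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=0)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=1)

# The search object is cloned and re-fit inside each outer training fold.
search = GridSearchCV(Ridge(), {"alpha": [0.1, 1.0, 10.0]}, cv=inner_cv)
nested_scores = cross_val_score(search, X, y, cv=outer_cv)
print(nested_scores.mean())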
Example #3
    def __init__(self, X, p=5, n_splits=2):
        self.X = X
        self.p = p
        self.n_splits = n_splits
        test_fold = self.X.groupby("user_id").cumcount().apply(
            lambda x: int(x / p) if x < (n_splits * p) else -1)
        self.s = PredefinedSplit(test_fold)
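With illustrative values, this is the test_fold the constructor builds: the first n_splits * p rows of each user are chunked into folds of size p, and the remainder stays train-only (-1). A small sketch:

import pandas as pd
from sklearn.model_selection import PredefinedSplit

X = pd.DataFrame({"user_id": [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]})
p, n_splits = 2, 2
test_fold = X.groupby("user_id").cumcount().apply(
    lambda x: int(x / p) if x < (n_splits * p) else -1)
print(test_fold.tolist())  # [0, 0, 1, 1, -1, 0, 0, 1, 1, -1]
ps = PredefinedSplit(test_fold)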
Example #4
def _load(collection, name, dirname=None):
    """Load dataset."""
    filename = _fetch_partition(collection, name, '', dirname=dirname)
    filename_tr = _fetch_partition(collection, name, '.tr', dirname=dirname)
    filename_val = _fetch_partition(collection, name, '.val', dirname=dirname)
    filename_t = _fetch_partition(collection, name, '.t', dirname=dirname)
    filename_r = _fetch_partition(collection, name, '.r', dirname=dirname)
    if (filename_tr is not None and filename_val is not None
            and filename_t is not None):
        _, _, X_tr, y_tr, X_val, y_val, X_test, y_test = load_svmlight_files(
            [filename, filename_tr, filename_val, filename_t])
        cv = PredefinedSplit([-1] * X_tr.shape[0] + [0] * X_val.shape[0])
        X = sp.sparse.vstack((X_tr, X_val))
        y = np.hstack((y_tr, y_val))
        X_remaining = y_remaining = None
    elif (filename_tr is not None) and (filename_val is not None):
        _, _, X_tr, y_tr, X_val, y_val = load_svmlight_files(
            [filename, filename_tr, filename_val])
        cv = PredefinedSplit([-1] * X_tr.shape[0] + [0] * X_val.shape[0])
        X = sp.sparse.vstack((X_tr, X_val))
        y = np.hstack((y_tr, y_val))
        X_test = y_test = X_remaining = y_remaining = None
    elif (filename_t is not None) and (filename_r is not None):
        X, y, X_test, y_test, X_remaining, y_remaining = load_svmlight_files(
            [filename, filename_t, filename_r])
        cv = None
    elif filename_t is not None:
        X, y, X_test, y_test = load_svmlight_files([filename, filename_t])
        X_remaining = y_remaining = cv = None
    else:
        X, y = load_svmlight_file(filename)
        X_test = y_test = X_remaining = y_remaining = cv = None
    return X, y, X_test, y_test, cv, X_remaining, y_remaining
Example #5
def rolling_model_PLS(df_X, df_Y):
    split_num = 200 * 60
    X_traindata = df_X[:split_num * 2]
    Y_traindata = df_Y[:split_num * 2]
    X_vdata = df_X[split_num:split_num * 2]
    X_testdata = df_X[split_num * 2:split_num * 3]
    Y_testdata = df_Y[split_num * 2:split_num * 3]

    num_valid_size = len(X_traindata) - len(X_vdata)
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)

    # specify parameters and distributions to sample from
    param_dist = {'n_components': sp_randint(1, 100), 'max_iter': sp_randint(50, len(X_traindata)),
                  'tol': [0.0001, 0.00001, 0.000001, 0.0000001]}
    # param_dist = {'n_components':[3,4]}
    PLS_model = PLSRegression(scale=False)

    # run gridsearchcv make_scorer(r2_score)
    n_iter_search = 50
    estim = RandomizedSearchCV(PLS_model, param_distributions=param_dist, scoring='r2',
                               cv=ps.split(), n_jobs=1, n_iter=n_iter_search)

    estim.fit(X_traindata, Y_traindata)
    best_estimator = estim.best_estimator_
    v_pred = best_estimator.predict(df_X[:split_num])
    v_performance_score = r2_score(df_Y[:split_num], v_pred)
    test_pre_y_array = best_estimator.predict(X_testdata)
    test_performance_score = r2_score(Y_testdata, test_pre_y_array)

    return v_performance_score, test_performance_score
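One caveat with cv=ps.split() above: split() returns a one-shot generator, so passing the splitter object itself (cv=ps) is the safer habit whenever the folds may be iterated more than once. A small sketch of the pitfall:

import numpy as np
from sklearn.model_selection import PredefinedSplit

ps = PredefinedSplit(np.array([-1, -1, 0, 0]))
gen = ps.split()
print(len(list(gen)))  # 1
print(len(list(gen)))  # 0 -- already exhausted; cv=ps avoids this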
Example #6
def decode(X, y, cv_ids, model):
    """
    Parameters
    --------------
    X: np.array, n_stimuli x n_voxels
    y: np.array, n_stimuli, 
    cv_ids: np.array - n_stimuli, 
    
    Return
    --------------
    models, scores
    """
    scores = []
    models = []
    ps = PredefinedSplit(cv_ids)
    for train_index, test_index in ps.split():
        # split the data
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # fit the model on the training set
        model.fit(X_train, y_train)
        # calculate the accuracy for the hold out run
        score = model.score(X_test, y_test)
        # save stuff
        models.append(deepcopy(model))
        scores.append(score)
    return models, scores
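A hypothetical usage sketch (assuming decode is in scope, with "from copy import deepcopy" and "from sklearn.model_selection import PredefinedSplit" imported): cv_ids plays the role of run labels, so each run is held out exactly once:

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.randn(12, 5)              # 12 stimuli x 5 voxels
y = np.tile([0, 1], 6)            # two stimulus classes
cv_ids = np.repeat([0, 1, 2], 4)  # three runs of four stimuli each

models, scores = decode(X, y, cv_ids, LogisticRegression())
print(scores)  # one accuracy per held-out run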
Example #8
def main(argv):
    start_time = datetime.now()
    logger.info("START")
    args = argparser.parse_args()
    inFile = args.inFile
    testFile = args.testFile
    nameModel = args.nameModel
    conf_file = args.mod
    mod = __import__(conf_file, fromlist=['*'])
    model_conf = mod.gridSearch_Model_types[nameModel]
    conf = getattr(__import__(conf_file, fromlist=[model_conf]), model_conf)
    prefix_dict = conf['prefix_dict']
    out_dict = h.outfileName(fo=args.outFile,
                             fi=inFile,
                             prefix_dict=prefix_dict,
                             add_date=True)
    logger.info("RUNNING WITH MOD: %s, INFILE: %s" % (conf_file, inFile))
    logger.info("LOADING THE DATA SET")
    param_grid = PARAM_DICT[nameModel]
    # scoring = {'Accuracy': make_scorer(accuracy_score),'RMS':make_scorer(mean_squared_error)}
    scoring = {'RMS': make_scorer(r2_score)}
    X, Y, len_train, numFeatures = readFile(inFile)
    cv = None
    if testFile:
        logger.info("USING TEST FILE %s AS TEST SET FOR THE CORSS VALIDATION" %
                    testFile)
        X_test, Y_test, len_train_test, numFeatures_test = readFile(inFile)
        X = pd.concat([X, X_test], ignore_index=True)
        Y = pd.concat([Y, Y_test], ignore_index=True)
        cv_arr = [1] * len_train
        cv_arr.extend([0] * len_train_test)
        cv = PredefinedSplit(test_fold=cv_arr)
        print("Stampa di cv: ", cv)
        print("numero di fold", cv.get_n_splits())
        for train_index, test_index in cv.split():
            print("TRAIN:", train_index, "TEST:", test_index)
        logger.info("SHAPE OF X:%s AND Y:%s AFTER APPEND", X.shape, Y.shape)
    logger.info("CREATION OF THE MODEL")
    t = TestClass(conf=conf, nm=nameModel, nf=numFeatures)
    if nameModel == 'NN':
        model = KerasClassifier(build_fn=t.createModelNN)
        X = X.to_numpy()
        Y = Y.to_numpy()
    else:
        model = t.selectModel()
    logger.info("START GRID SEARCH")
    grid_result = gridSearch(model, param_grid, cv, X, Y, scoring)
    logger.info("END OF GRID SEARCH")
    logger.info("PRINTING RESULTS")
    gridResults(grid_result, X, nameModel)
    SaveModel(nameModel, grid_result)
    logger.info("EXECUTED IN %f SEC" %
                ((datetime.now() - start_time)).total_seconds())
    logger.info("END")
Example #9
def test_predefined_split():
    cv = PredefinedSplit(np.array(list(range(4)) * 5))
    cv2 = PredefinedSplit(np.array(list(range(5)) * 4))
    assert tokenize(cv) == tokenize(cv)
    assert tokenize(cv) != tokenize(cv2)

    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol

    with assert_dask_compute(False):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol
Example #10
def split_dataset(dataset):
    X = dataset.drop(y_col, axis=1)
    y = dataset[y_col]
    test_fold = (fold_pattern *
                 ((dataset.shape[0] - 1) // len(fold_pattern) + 1))[:dataset.shape[0]]
    splitter = PredefinedSplit(test_fold)
    # Only the last yielded split is kept, so this expects a single-fold pattern.
    for train_index, test_index in splitter.split():
        X_train, X_test = safe_indexing(X, train_index), safe_indexing(X, test_index)
        y_train, y_test = safe_indexing(y, train_index), safe_indexing(y, test_index)
    return X_train, y_train, X_test, y_test
Example #11
    def split(self, data):
        """Perform a data split with a fixed size for the test set"""
        data_size = 0
        if data.is_row_split_validation():
            #Time series split data by columns
            data_size = data.get_features().shape[1]
        else:
            data_size = data.get_features().shape[0]

        test_fold = [-1 for i in range(0, data_size - self.test_size_)]
        test_fold += [0 for i in range(data_size - self.test_size_, data_size)]
        splitter = PredefinedSplit(test_fold=test_fold)
        return splitter.split()
Example #12
def rolling_model_ENetH(df_X, df_Y):
    split_num = 200 * 60
    X_traindata = df_X[:split_num * 2]
    Y_traindata = df_Y[:split_num * 2]
    X_vdata = df_X[split_num:split_num * 2]
    X_testdata = df_X[split_num * 2:split_num * 3]
    Y_testdata = df_Y[split_num * 2:split_num * 3]

    num_valid_size = len(X_traindata) - len(X_vdata)
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)

    # specify parameters and distributions to sample from
    param_dist = {
        'alpha': uniform(0.00001, 0.1),
        'power_t': uniform(0.1, 0.9),
        'l1_ratio': uniform(0.1, 0.9),
        'eta0': uniform(0.00001, 0.1),
        'epsilon': uniform(0.01, 0.9),
        'max_iter': sp_randint(5, 10000),
        'tol': [0.01, 0.001, 0.0001, 0.00001],
        'fit_intercept': [True, False]
    }

    clf = SGDRegressor(shuffle=False,
                       loss='huber',
                       penalty='elasticnet',
                       random_state=100)

    # run randomized search
    n_iter_search = 100
    estim = RandomizedSearchCV(clf,
                               param_distributions=param_dist,
                               n_iter=n_iter_search,
                               scoring='r2',
                               cv=ps.split(),
                               random_state=100,
                               n_jobs=1)

    estim.fit(X_traindata, Y_traindata)
    best_estimator = estim.best_estimator_
    v_pred = best_estimator.predict(df_X[:split_num])
    v_performance_score = r2_score(df_Y[:split_num], v_pred)
    test_pre_y_array = best_estimator.predict(X_testdata)
    test_performance_score = r2_score(Y_testdata, test_pre_y_array)

    return v_performance_score, test_performance_score
Example #13
def train_self(scoring='accuracy'):
    csv_dir = Path("Features/CSV")
    for i in range(10):
        train_test_dir = csv_dir / f"train_test{i}"
        results_dir = Path("Results") / f"self{i}"
        results_dir.mkdir(exist_ok=True)
        for dataset_file in train_test_dir.glob("*test*"):
            dataset = str(dataset_file.stem).split("_test")[0]
            suffixes = ["train", "train_train", "train_val"]
            keys = [f"{s}" for s in suffixes]
            df_dict = {
                key: pd.read_csv(train_test_dir / f"{dataset}_{key}.csv")
                for key in keys
            }

            #xgboost with eval
            ###################################
            data = pd.concat([df_dict["train_train"], df_dict["train_val"]],
                             axis=0)
            data.reset_index(inplace=True, drop=True)
            val_idx = np.concatenate(
                ((-1) * np.ones(df_dict["train_train"].shape[0]),
                 np.zeros(df_dict["train_val"].shape[0])))
            ps = PredefinedSplit(val_idx)
            X = data.drop(columns=["Label", "microRNA_name"])
            y = data.Label.ravel()
            train_index, val_index = next(ps.split())
            X_val = X.iloc[val_index]
            y_val = y[val_index]

            output_file = results_dir / f"{dataset}_xgbs_val_results.csv"
            print(output_file)
            if not output_file.exists():
                clf = XGBClassifier(verbosity=0)  # `silent` was removed in xgboost 1.0
                grid_obj = GridSearchCV(clf,
                                        XGBS_PARAMS,
                                        scoring=scoring,
                                        cv=ps,
                                        verbose=3)
                fit_params = {
                    "eval_set": [(X_val, y_val)],
                    "early_stopping_rounds": 50
                }
                grid_obj.fit(X, y, **fit_params)

                print('\n Best estimator:')
                print(grid_obj.best_estimator_)
                print(grid_obj.best_score_ * 2 - 1)
                results = pd.DataFrame(grid_obj.cv_results_)
                results.to_csv(output_file, index=False)
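The next(ps.split()) idiom above pulls out the single train/validation split that a -1/0 test_fold encodes. A minimal sketch of what it yields:

import numpy as np
from sklearn.model_selection import PredefinedSplit

val_idx = np.array([-1, -1, -1, 0, 0])  # 3 training rows, 2 validation rows
train_index, val_index = next(PredefinedSplit(val_idx).split())
print(train_index, val_index)  # [0 1 2] [3 4]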
Example #14
def cross_val_predict(X,
                      y,
                      estimator,
                      param_grid=None,
                      num_cvfolds=5,
                      num_tunefolds=3,
                      logger=None,
                      random_state=None):
    """
    Generates predictions for all instances in X using cross-validation.
    """

    # create folds
    np.random.seed(random_state)
    cv_folds = np.random.randint(num_cvfolds, size=X.shape[0])

    # store predictions
    p = y.copy().astype(float)

    # make predictions on each fold
    for i, (train_index, test_index) in enumerate(PredefinedSplit(cv_folds).split()):
        start = time.time()

        X_train, y_train = X[train_index], y[train_index]
        X_test = X[test_index]

        # tune the hyperparameters on this training fold
        if param_grid is not None:
            np.random.seed(random_state)
            tune_folds = np.random.randint(num_tunefolds,
                                           size=X_train.shape[0])

            model = GridSearchCV(clone(estimator),
                                 cv=PredefinedSplit(tune_folds),
                                 param_grid=param_grid)
            model = clone(model).fit(X_train, y_train)

        else:
            model = clone(estimator).fit(X_train, y_train)

        # make predictions on this test set
        y_score = model.predict_proba(X_test)[:, 1]
        np.put(p, test_index, y_score)

        if logger:
            logger.info('[CV] fold {}: {:.3f}s'.format(i, time.time() - start))

    assert len(p) == len(y)
    return p
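Scikit-learn also ships its own cross_val_predict, which covers the no-tuning path of the function above (out-of-fold predictions for every instance); the hand-rolled loop is still needed when hyperparameters are tuned inside each fold. A minimal sketch:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X, y = make_classification(n_samples=200, random_state=0)
# Out-of-fold P(y=1) for every instance, analogous to `p` above.
p = cross_val_predict(LogisticRegression(), X, y, cv=5,
                      method='predict_proba')[:, 1]
print(p.shape)  # (200,)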
Example #15
    def __fun_param_set(self):
        """Set the parameters used to train the DNN from the input parameters."""
        #set the number of neurons in hidden layers
        #layer-2 is half of layer-1
        neuron_num_1st_layer = [
            self.input_dim,
            int(1.5 * self.input_dim), 2 * self.input_dim
        ]
        neuron_num_2nd_layer = [int(x / 2) for x in neuron_num_1st_layer]
        self.neurons = list(zip(neuron_num_1st_layer, neuron_num_2nd_layer))
        self.neurons = [list(x) for x in self.neurons]

        self.optimizer = Adam(
        )  # By default we use Adam, and not tune learning rate

        #Set activation function for hidden layer
        self.activation_hidden = 'relu'

        #set activation/loss function for the output layer based on output dimensionality
        if self.output_dim > 1:
            self.activation_output = 'sigmoid'  #multi-class multi-label classification
            self.loss_fun = 'binary_crossentropy'
        else:
            # A single output unit needs a sigmoid; softmax over one unit is constant.
            self.activation_output = 'sigmoid'  #binary classification
            self.loss_fun = 'binary_crossentropy'

        #Set batch size
        if self.batch_size_flag:
            self.batch_size = [16, 32]  #Tune batch size
        else:
            self.batch_size = [32]  #fix batch size

        if self.dropout_flag:
            self.dropout_rate = [0.2, 0.4]
        else:
            self.dropout_rate = [0.2]

        #split training data into training and validation (fast version of model training)
        if self.cv == 1:
            t_size = int(self.x_train.shape[0] * 0.8)
            self.train_val_split = [-1] * t_size + [0] * (
                self.x_train.shape[0] - t_size)
            seed(self.rand_seed)
            shuffle(self.train_val_split)
            self.ps = PredefinedSplit(self.train_val_split)
        else:
            self.ps = self.cv
Example #16
            def rolling_model_PLS(
                    X_traindata=X_traindata,
                    Y_traindata_demean=np.ravel(Y_traindata_demean),
                    X_traindata1=X_traindata1,
                    Y_traindata1=np.ravel(Y_traindata1),
                    X_testdata=X_testdata,
                    Y_testdata=np.ravel(Y_testdata),
                    mean_Ytrain=mean_Ytrain):

                split_num = 200 * 60
                num_valid_size = split_num
                test_fold = -1 * np.ones(len(X_traindata))
                test_fold[num_valid_size:] = 0
                ps = PredefinedSplit(test_fold)

                # specify parameters and distributions to sample from
                param_dist = {
                    'n_components': sp_randint(1, 31),
                    'max_iter': sp_randint(50, len(X_traindata)),
                    'tol': [0.0001, 0.00001, 0.000001, 0.0000001]
                }

                PLS_model = PLSRegression(scale=False)

                # run gridsearchcv make_scorer(r2_score)
                n_iter_search = 50
                estim = RandomizedSearchCV(PLS_model,
                                           param_distributions=param_dist,
                                           scoring='r2',
                                           cv=ps.split(),
                                           n_jobs=-1,
                                           n_iter=n_iter_search)

                estim.fit(X_traindata, Y_traindata_demean)
                best_estimator = estim.best_estimator_

                train_predict = best_estimator.predict(
                    X_traindata1) + mean_Ytrain
                IS_score = r2_score(Y_traindata1, train_predict)

                test_predict = best_estimator.predict(X_testdata) + mean_Ytrain
                test_predict = test_predict[:, 0]
                OOS_score = 1 - np.sum(
                    (Y_testdata - test_predict)**2) / np.sum(
                        (Y_testdata - mean_Ytrain)**2)

                return IS_score, OOS_score
Example #17
    def train(self, data, clf='rf', param_search='single', tune_size=0.15,
              scoring='roc_auc', n_jobs=1, verbose=1):
        """Trains a classifier with the specified training data.
        data: tuple including training data.
        clf: string of {'rf', 'lr', 'xgb'}.
        Returns trained classifier."""
        x_train, y_train, _, features = data

        if param_search == 'single' or tune_size == 0:
            model, params = self.classifier(clf, param_search='single')
            model.set_params(**params)

        elif tune_size > 0:
            t1 = self.out('tuning...')
            model, params = self.classifier(clf, param_search=param_search)
            train_len = x_train.shape[0]

            split_ndx = train_len - int(train_len * tune_size)
            sm_x_train, x_val = x_train[:split_ndx], x_train[split_ndx:]
            sm_train_fold = np.full(sm_x_train.shape[0], -1)
            val_fold = np.full(x_val.shape[0], 0)

            predefined_fold = np.append(sm_train_fold, val_fold)
            ps = PredefinedSplit(predefined_fold)
            cv = ps.split(x_train, y_train)
            m = GridSearchCV(model, params, scoring=scoring, cv=cv,
                             verbose=verbose, n_jobs=n_jobs)
            m.fit(x_train, y_train)
            model = m.best_estimator_
            self.time(t1)

        t1 = self.out('training...')

        if clf == 'lgb':
            cat_feat = ['app', 'device', 'os', 'channel', 'hour']
            cat_feat_ndx = [features.index(x) for x in cat_feat]
            train_len = x_train.shape[0]
            split_ndx = train_len - int(train_len * tune_size)
            sm_x_train, x_val = x_train[:split_ndx], x_train[split_ndx:]
            sm_y_train, y_val = y_train[:split_ndx], y_train[split_ndx:]
            eval_set = (x_val, y_val)
            model = model.fit(sm_x_train, sm_y_train, eval_set=eval_set,
                              early_stopping_rounds=50, eval_metric='auc',
                              categorical_feature=cat_feat_ndx)
        else:
            model = model.fit(x_train, y_train)

        self.time(t1)
        self.out(str(model))
        return model
Example #18
def predefined_train_test_split(data, labels, folds, workflow, label_encoder):
    folds = np.asarray(folds)
    
    fold_encoder = LabelEncoder()
    split_encoded = fold_encoder.fit_transform(folds)
    
    num_classes = len(label_encoder.classes_)
    
    performance = {
        'classes': label_encoder.classes_.tolist(),
        'intervals': {key: np.sum(folds == key) for key in sorted(list(set(folds)))}
    }
    
    split = PredefinedSplit(split_encoded)
    for fold_index, (train_inds, test_inds) in enumerate(split.split()):
        train_x, train_y = [data[ii] for ii in train_inds], [labels[ii] for ii in train_inds]
        test_x, test_y = [data[ii] for ii in test_inds], [labels[ii] for ii in test_inds]
        
        prior_train = [0] * num_classes
        for yy in train_y:
            prior_train[yy] += 1
        
        prior_test = [0] * num_classes
        for yy in test_y:
            prior_test[yy] += 1
        
        clf = deepcopy(workflow)
        clf.fit(train_x, train_y)
        param_dict = {kk: vv.__dict__ for kk, vv in clf.named_steps.items()}
        
        test_pred = clf.predict(test_x)
        
        test_ind = folds[test_inds[0]]
        performance[test_ind] = {
            'accuracy': metrics.accuracy_score(test_y, test_pred),
            'precision_micro': metrics.precision_score(test_y, test_pred, average='micro'),
            'precision_macro': metrics.precision_score(test_y, test_pred, average='macro'),
            'recall_micro': metrics.recall_score(test_y, test_pred, average='micro'),
            'recall_macro': metrics.recall_score(test_y, test_pred, average='macro'),
            'f1_score_micro': metrics.f1_score(test_y, test_pred, average='micro'),
            'f1_score_macro': metrics.f1_score(test_y, test_pred, average='macro'),
            'confusion_matrix': metrics.confusion_matrix(test_y, test_pred).tolist(),
            'prior_train': prior_train,
            'prior_test': prior_test,
            'model': serialise_dict(param_dict)
        }
    
    return serialise_dict(performance)
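The LabelEncoder step above turns arbitrary (e.g. string) fold keys into the integer codes PredefinedSplit expects. A small sketch with hypothetical interval keys:

from sklearn.model_selection import PredefinedSplit
from sklearn.preprocessing import LabelEncoder

folds = ["2017-Q1", "2017-Q1", "2017-Q2", "2017-Q3"]  # illustrative fold keys
encoded = LabelEncoder().fit_transform(folds)
print(encoded)  # [0 0 1 2]
print(PredefinedSplit(encoded).get_n_splits())  # 3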
Example #19
    def __init__(self, train, dev, config, cross_val=True):
        self.baseline_config = config
        self.split = 5
        
        if not cross_val:
            train_samples = [-1 for i in range(len(train[0]))]
            dev_samples = [0 for i in range(len(dev[0]))]
            self.split = PredefinedSplit(test_fold=np.concatenate((train_samples, dev_samples)))
        
        self.train_x = np.concatenate((train[0], dev[0]))
        self.train_y = np.concatenate((train[1], dev[1])) 

        print('Finding optimal CNN model configuration with:')
        print(f"Number of classes: {self.baseline_config['data']['num_classes']}")
        print(f"Static: {self.baseline_config['CNN']['static']}")
        print(f"Dataset path: {self.baseline_config['data']['output']}\n\n")

        self.activation_function = self.baseline_config['CNN']['activation_function']
        self.filter_sizes = self.baseline_config['CNN']['filter_sizes']
        self.output_filters_per_size = self.baseline_config['CNN']['output_filters_per_size']
        self.dropout_rate = self.baseline_config['CNN']['dropout_rate']
        self.batch_size = None
        self.epochs = None

        self.batch_size, self.epochs = self.best_batch_size_and_epochs()
        individual_filter_size = self.best_individual_filter_size()
        self.filter_sizes = self.best_filter_size_combination(individual_filter_size)
        self.activation_function = self.best_activation_function()
        self.output_filters_per_size, self.dropout_rate = self.best_num_feature_maps_and_dropout()
        
        print('Optimal configuration:')
        self.print_configuration()
Example #20
def train_model(dataset, classifier, params):

    if params is not None:
        split = PredefinedSplit(
            test_fold=[-1 for i in range(dataset['train'][0].shape[0])] +
            [0 for i in range(dataset['valid'][0].shape[0])])
        classifier = GridSearchCV(classifier, params, cv=split, refit=True)
        merged_input = sparse.vstack(
            [dataset['train'][0], dataset['valid'][0]])
        merged_output = np.concatenate(
            (dataset['train'][1], dataset['valid'][1]))
        classifier.fit(merged_input, merged_output)
    else:
        classifier.fit(dataset['train'][0], dataset['train'][1])

    prediction_train = f1_score(dataset['train'][1],
                                classifier.predict(dataset['train'][0]),
                                average=AVERAGE)
    prediction_valid = f1_score(dataset['valid'][1],
                                classifier.predict(dataset['valid'][0]),
                                average=AVERAGE)
    prediction_test = f1_score(dataset['test'][1],
                               classifier.predict(dataset['test'][0]),
                               average=AVERAGE)
    best_param = None if params is None else classifier.best_params_

    return prediction_train, prediction_valid, prediction_test, best_param
Example #21
    def find_best_params(self, validation_data_x, validation_data_y, n_jobs=1, params=None):
        if not params:
            params = self.get_default_param_grid()

        merged_x = Data.merge_arrays(self.training_data_x, validation_data_x)
        merged_y = Data.merge_arrays(self.training_data_y, validation_data_y)
        test_fold = []

        # -1 keeps the training rows out of every test fold; 0 marks validation rows.
        for i in range(0, len(self.training_data_y)):
            test_fold.append(-1)
        for i in range(0, len(validation_data_y)):
            test_fold.append(0)

        cv = PredefinedSplit(test_fold)

        gs = GridSearchCV(
            estimator=GaussianNB(),
            scoring='f1_micro',
            param_grid=params,
            n_jobs=n_jobs,
            cv=cv
        )

        gs.fit(merged_x,merged_y)

        best_params = gs.best_params_
        results = gs.cv_results_
        return best_params,results
Example #22
def neighbors(train, test, target, cv: PredefinedSplit, k=5, n_trees=10):
    res_train = np.zeros((train.shape[0], 2))
    res_test = np.zeros((test.shape[0], 2))
    for i, (trn_idx, val_idx) in tqdm(enumerate(cv.split(train)),
                                      total=cv.get_n_splits()):
        target_trn = target.iloc[trn_idx]
        X_trn = train.iloc[trn_idx]
        X_val = train.iloc[val_idx]
        n = X_trn[target_trn == 0]
        p = X_trn[target_trn == 1]
        for j, X in enumerate([n, p]):
            u = build(X, n_trees)
            res_train[val_idx, j] = get_feat(X_val, u, k=k)
            res_test[:, j] += get_feat(test, u, k)
    res_test /= cv.get_n_splits()
    return res_train, res_test
Example #23
    def fit(self, X_train, y_train, X_val, y_val):

        if X_train.ndim != 2:
            raise ValueError('`X_train` is incompatible: expected ndim=2, found ndim=' + str(X_train.ndim))
        elif X_val.ndim != 2:
            raise ValueError('`X_val` is incompatible: expected ndim=2, found ndim=' + str(X_val.ndim))

        print('Dimension of training set is: {} and label is: {}'.format(X_train.shape, y_train.shape))
        print('Dimension of validation set is: {} and label is: {}'.format(X_val.shape, y_val.shape))

        X_all = np.concatenate((X_train, X_val),axis=0)
        y_all = np.concatenate((y_train, y_val),axis=0)

        # Create a list where train data indices are -1 and validation data indices are 0
        tr_index = np.full((X_train.shape[0]), -1)
        val_index = np.full((X_val.shape[0]), 0)
        split_index = np.concatenate((tr_index, val_index), axis=0).tolist()
        # Use the list to create PredefinedSplit
        pds = PredefinedSplit(test_fold=split_index)
        clf = GridSearchCV(estimator=SVC(), param_grid=self.tuned_parameters, cv=pds, scoring='accuracy')
        start = time.time()
        clf.fit(X_all , y_all)
        end = time.time()
        #Classifying with an optimal parameter set
        Optimal_params = clf.best_params_
        print(Optimal_params)
        classifier = SVC(**Optimal_params)
        classifier.fit(X_train, y_train)
        dump(classifier, self.model_path)
        write_log(filepath=self.time_log, data=['time_log'], mode='w')
        write_log(filepath=self.time_log, data=[end-start], mode='a')
Example #24
def train_predict(clf_key, classifier, param_grid, trainX, trainY, valX, valY,
                  testX, testY):
    all_tr_val_X = np.vstack((trainX, valX))
    all_tr_val_Y = np.hstack((trainY, valY))
    fold_meta = np.zeros(all_tr_val_X.shape[0])
    fold_meta[0:trainX.shape[0]] = -1
    cv = PredefinedSplit(test_fold=fold_meta)
    gcv = GridSearchCV(estimator=classifier,
                       param_grid=param_grid,
                       cv=cv,
                       verbose=0,
                       n_jobs=2,
                       scoring='accuracy')
    gcv.fit(all_tr_val_X, all_tr_val_Y)
    predictions = gcv.predict(testX)
    cm = confusion_matrix(testY, predictions)
    classes_lst = [
        'Corn', 'Cotton', 'Soy', 'Spring Wheat', 'Winter Wheat', 'Barley'
    ]
    creport = classification_report(y_true=testY,
                                    y_pred=predictions,
                                    target_names=classes_lst,
                                    digits=4,
                                    output_dict=True)
    creport_df = pd.DataFrame(creport).transpose()
    acc = accuracy_score(testY, predictions)
    print(creport)
    kappa_score = cohen_kappa_score(testY, predictions)
    print('Classifier : {}'.format(clf_key))
    print('best params: {}'.format(gcv.best_params_))
    print(
        'Accuracy is {}\n Kappa Score is {}\n confusion matrix is {}\n clf report is {}'
        .format(acc, kappa_score, cm, creport))
Example #25
    def find_best_params(self, validation_data_x, validation_data_y, alpha_vals, n_jobs=1):

        merged_x = Data.merge_arrays(self.training_data_x, validation_data_x)
        merged_y = Data.merge_arrays(self.training_data_y, validation_data_y)
        test_fold = []

        # -1 keeps the training rows out of every test fold; 0 marks validation rows.
        for i in range(0, len(self.training_data_y)):
            test_fold.append(-1)
        for i in range(0, len(validation_data_y)):
            test_fold.append(0)

        cv = PredefinedSplit(test_fold)

        param = {"alpha": alpha_vals}
        gs = GridSearchCV(
            estimator=BernoulliNB(),
            scoring='f1_micro',
            param_grid=param,
            n_jobs=n_jobs,
            cv=cv
        )

        gs.fit(merged_x,merged_y)

        best_params = gs.best_params_
        results = gs.cv_results_
        return best_params,results
Example #26
def gridSearch_cv(train_x,
                  train_y,
                  test_x,
                  test_y,
                  param_grid,
                  folds,
                  scoring,
                  refit,
                  shuf,
                  dev_ratio=0):
    cv_folds = KFold(n_splits=folds, shuffle=shuf)
    if dev_ratio > 0:
        pre_fold = np.ones(train_x.shape[0]) * -1
        inds = np.random.choice(pre_fold.size,
                                size=math.floor(dev_ratio * train_x.shape[0]),
                                replace=False)  # without replace=False, duplicates shrink the dev fold
        pre_fold[inds] = 0
        cv_folds = PredefinedSplit(test_fold=pre_fold)
    svm = SVC()
    clf = GridSearchCV(estimator=svm,
                       param_grid=param_grid,
                       cv=cv_folds,
                       scoring=scoring,
                       refit=refit)
    clf.fit(train_x, train_y)
    logging.info("Best parameters in grid search:")
    logging.info("")
    logging.info(clf.best_params_)
    logging.info("Test scores:")
    score = clf.score(test_x, test_y)
    logging.info("test " + refit + " score: %0.3f ", score)
    pred_y = clf.predict(test_x)
    logging.info(classification_report(test_y, pred_y))
Example #27
def tuning(X_train, Y_train, X_val, Y_val, classifier, params):
    '''
    Tunes hyperparameters by running the classifier trained using training data
    on the range of parameters given, and returns the parameters which give the
    best f1-score on test data
    '''
    #Combine training and validation into one set
    X = vstack([X_train,X_val])
    Y_train.extend(Y_val)
    Y = np.array(Y_train)
    
    #Mark the training-validation splits
    train_i = np.ones((X_train.shape[0],), dtype = int) * -1
    valid_i = np.zeros((X_val.shape[0],), dtype = int)
    split_fold = np.concatenate((train_i, valid_i))
    ps = PredefinedSplit(split_fold)
    
    param_search = GridSearchCV(classifier, params, scoring=metrics.make_scorer(metrics.f1_score, average='macro'), cv=ps, return_train_score=True)
    param_search.fit(X,Y)
    results = param_search.cv_results_
    best_params = param_search.best_params_
    
    #Plotting
    #test_scores = results.get('split0_test_score')
    #par_ranges = params.values()
    #plt.plot(par_ranges[0],test_scores,'r-')
    #plt.show
    
    
    return best_params, results
Example #28
def random_search(model, model_type, df, task_type='regression', refit='r2', verbose=False):
	years_present = len(df.index.get_level_values('year').unique())
	starting_year = STARTING_YEAR
	last_year = starting_year + years_present - 1
	train_year = starting_year + round(years_present * .7)
	validation_year = starting_year + round((train_year-starting_year) * .8)

	test_year = train_year + 1
	X = df[(starting_year <= df.index.get_level_values('year')) & (df.index.get_level_values('year') <= train_year)]
	mask = (starting_year <= X.index.get_level_values('year')) & (X.index.get_level_values('year') <= validation_year)
	validation_fold = list(map(lambda x: -1 if x else 0, list(mask)))
	ps = PredefinedSplit(validation_fold)

	random_search = RandomizedSearchCV(model, param_distributions=get_param_grid(model_type=model_type, task_type=task_type), cv=ps, n_iter=10, n_jobs=-1, verbose=10, refit=True, scoring=refit, random_state=SEED)

	if verbose:
		print('Fitting with Random Search...')
	X_train = X.drop('crime_count', axis=1)
	y_train = X['crime_count']
	if task_type == 'classification':
		y_train = y_train.apply(lambda x: 1 if x > 0.5 else 0).round(0).astype(int)
	random_search.fit(X_train, y_train)
	if verbose:
		print('Done fitting...')

	return random_search.best_estimator_
Example #29
def main(data_dir, log_dir, source='xl-1542M-k40', n_train=500000, n_valid=10000, n_jobs=None, verbose=False):
    train_texts, train_labels = load_split(data_dir, source, 'train', n=n_train)
    valid_texts, valid_labels = load_split(data_dir, source, 'valid', n=n_valid)
    test_texts, test_labels = load_split(data_dir, source, 'test')

    vect = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=2**21)
    train_features = vect.fit_transform(train_texts)
    valid_features = vect.transform(valid_texts)
    test_features = vect.transform(test_texts)

    model = LogisticRegression(solver='liblinear')
    params = {'C': [1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1, 2, 4, 8, 16, 32, 64]}
    split = PredefinedSplit([-1]*n_train+[0]*n_valid)
    search = GridSearchCV(model, params, cv=split, n_jobs=n_jobs, verbose=verbose, refit=False)
    search.fit(sparse.vstack([train_features, valid_features]), train_labels+valid_labels)
    model = model.set_params(**search.best_params_)
    model.fit(train_features, train_labels)
    valid_accuracy = model.score(valid_features, valid_labels)*100.
    test_accuracy = model.score(test_features, test_labels)*100.
    data = {
        'source':source,
        'n_train':n_train,
        'valid_accuracy':valid_accuracy,
        'test_accuracy':test_accuracy
    }
    print(data)
    json.dump(data, open(os.path.join(log_dir, f'{source}.json'), 'w'))
Example #30
def _get_scores_and_estimators(experiment: Experiment) -> Tuple[List[float], List[Any]]:
    if experiment.test_set is not None:
        assert experiment.cross_validator is None, "Cannot use a cross validator with train test split"
        dataset = pd.concat([experiment.dataset, experiment.test_set])
        split = np.array([-1] * len(experiment.dataset) + [1] * len(experiment.test_set))
        cross_validator = PredefinedSplit(split)
    else:
        dataset = experiment.dataset
        cross_validator = experiment.cross_validator

    X = dataset.drop(columns=[experiment.label_column])
    y = dataset[experiment.label_column]
    if experiment.group_column is None:
        if experiment.average_scores_on_instances:
            groups = Series(range(len(X)), index=X.index)
        else:
            groups = None
    else:
        groups = X[experiment.group_column]
        X = X.drop(columns=[experiment.group_column])

    cv = check_cv(cross_validator, y, classifier=is_classifier(experiment.predictor))
    train_test = cv.split(X, y, groups)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=None, verbose=False,
                        pre_dispatch='2*n_jobs')
    scores_and_estimators = parallel(
        delayed(_fit_and_predict)(
            clone(experiment.predictor), X, y, train, test, groups, experiment.scorer)
        for train, test in train_test)
    scores_lists, estimators = zip(*scores_and_estimators)
    scores = [score for score_list in scores_lists for score in score_list]
    return scores, estimators
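check_cv above normalizes whatever cross_validator holds (an int, None, or a splitter such as PredefinedSplit) into an object exposing split(). A quick sketch of that behavior:

from sklearn.model_selection import PredefinedSplit, check_cv

cv = check_cv(PredefinedSplit([-1, 0, 0, 1]), classifier=False)
print(cv.get_n_splits())  # 2: folds 0 and 1; the -1 row is train-only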
Example #31
def go(data_dict, feats_to_use, params={"seed": 0, "silent": False, "n_jobs": -1},
       parameter_tuning=False):
    
    '''
    if with_gpu:
        xgb = XGBRegressor(seed=0, silent=False, tree_method='gpu_hist', n_gpus=-1)
    else:
        xgb = XGBRegressor(seed=0, silent=False, n_jobs=-1)
    '''
    X_train=data_dict['X_train'][feats_to_use].copy()
    y_train=data_dict['y_train'].copy()
    X_test=data_dict['X_test'][feats_to_use].copy()
    X_val=data_dict['X_val'][feats_to_use].copy()
    y_val=data_dict['y_val'].copy()

    
    
    if parameter_tuning:
        fit_params={
        "early_stopping_rounds":10, 
        "eval_metric" : "rmse", 
        "eval_set" : [(X_val,y_val)]}
        xgb=XGBRegressor() 
        train_val_features=pd.concat([X_train,X_val])
        train_val_labels=pd.concat([y_train,y_val])
        test_fold = np.zeros(train_val_features.shape[0])   # initialize all index to 0
        test_fold[:X_train.shape[0]] = -1   # set index of training set to -1, indicating not to use it in validation
        
        ps=PredefinedSplit(test_fold=test_fold)
        X_train=data_dict['X_train'][feats_to_use]
        y_train=data_dict['y_train']
        X_test=data_dict['X_test'][feats_to_use]
        grid = GridSearchCV(xgb, params, scoring=RMSE, cv=ps, verbose=32, n_jobs=-1)
        start = time.time()
        # fit-time arguments (eval_set, early stopping) go to fit(); the
        # fit_params constructor argument was removed from GridSearchCV.
        grid.fit(train_val_features, train_val_labels, **fit_params)
        elapsed = time.time() - start
        print(elapsed)
        print('best params:', grid.best_params_)
        print('best score:', grid.best_score_)

        return grid.best_params_, grid.best_estimator_
        
    else:
        xgb = XGBRegressor(**params)
        print(xgb)

        print('start xgboost training')
        start = time.time()
        eval_set = [(X_val, y_val)]
        xgb.fit(X_train, y_train, eval_set=eval_set, eval_metric='rmse', early_stopping_rounds=30)
        elapsed = time.time() - start
        print(elapsed)
        data_dict['y_pred']=np.exp(xgb.predict(X_test))-1

        #generate submission
        data_dict['X_test']['item_cnt_month']=data_dict['y_pred']
        test=pd.read_csv('test.csv')
        submission=pd.merge(test,data_dict['X_test'], 
            on=['shop_id','item_id'],how='left')[['ID','item_cnt_month']]
        return submission, xgb
Example #32
def test_predefinedsplit_with_kfold_split():
    # Check that PredefinedSplit can reproduce a split generated by Kfold.
    folds = -1 * np.ones(10)
    kf_train = []
    kf_test = []
    for i, (train_ind, test_ind) in enumerate(KFold(5, shuffle=True).split(X)):
        kf_train.append(train_ind)
        kf_test.append(test_ind)
        folds[test_ind] = i
    ps_train = []
    ps_test = []
    ps = PredefinedSplit(folds)
    # n_splits is simply the number of unique folds
    assert_equal(len(np.unique(folds)), ps.get_n_splits())
    for train_ind, test_ind in ps.split():
        ps_train.append(train_ind)
        ps_test.append(test_ind)
    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test)
Example #33
    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = X.shape[0]
        if self.n_splits > n_samples:
            raise ValueError(
                ("Cannot have number of splits n_splits={0} greater"
                 " than the number of samples: n_samples={1}."
                 ).format(self.n_splits, n_samples))

        # generate test fold
        test_fold = np.arange(n_samples, dtype=int) % self.n_splits
        cv = PredefinedSplit(test_fold)

        return cv.split()
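The modulo assignment above distributes samples round-robin across the folds. A short sketch of the test_fold it builds:

import numpy as np
from sklearn.model_selection import PredefinedSplit

n_samples, n_splits = 7, 3
test_fold = np.arange(n_samples, dtype=int) % n_splits
print(test_fold)  # [0 1 2 0 1 2 0]
print(PredefinedSplit(test_fold).get_n_splits())  # 3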