Example no. 1
def main(data_dir,
         log_dir,
         source='xl-1542M-k40',
         n_train=500000,
         n_valid=10000,
         n_jobs=None,
         verbose=False):
    train_texts, train_labels = load_split(data_dir,
                                           source,
                                           'train',
                                           n=n_train)
    valid_texts, valid_labels = load_split(data_dir,
                                           source,
                                           'valid',
                                           n=n_valid)
    test_texts, test_labels = load_split(data_dir, source, 'test')

    vect = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=2**21)
    train_features = vect.fit_transform(train_texts)
    valid_features = vect.transform(valid_texts)
    test_features = vect.transform(test_texts)

    model = LogisticRegression(solver='liblinear')
    params = {
        'C':
        [1 / 64, 1 / 32, 1 / 16, 1 / 8, 1 / 4, 1 / 2, 1, 2, 4, 8, 16, 32, 64]
    }
    split = PredefinedSplit([-1] * n_train + [0] * n_valid)
    search = GridSearchCV(model,
                          params,
                          cv=split,
                          n_jobs=n_jobs,
                          verbose=verbose,
                          refit=False)
    search.fit(sparse.vstack([train_features, valid_features]),
               train_labels + valid_labels)
    model = model.set_params(**search.best_params_)
    model.fit(train_features, train_labels)
    valid_accuracy = model.score(valid_features, valid_labels) * 100.
    test_accuracy = model.score(test_features, test_labels) * 100.
    data = {
        'source': source,
        'n_train': n_train,
        'valid_accuracy': valid_accuracy,
        'test_accuracy': test_accuracy
    }
    print(data)
    with open(os.path.join(log_dir, f'{source}.json'), 'w') as fp:
        json.dump(data, fp)
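Throughout these examples PredefinedSplit is given one fold label per sample: -1 marks samples that stay in the training set for every split, while any value >= 0 assigns the sample to that test fold. A minimal standalone sketch of that convention (not part of the original code, assuming only NumPy and scikit-learn):

import numpy as np
from sklearn.model_selection import PredefinedSplit

test_fold = np.array([-1, -1, -1, 0, 0])  # three permanent training samples, two validation samples
ps = PredefinedSplit(test_fold)
for train_idx, valid_idx in ps.split():
    print(train_idx, valid_idx)  # -> [0 1 2] [3 4]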
Example no. 2
def predefined_train_test_split(data, labels, folds, workflow, label_encoder):
    folds = np.asarray(folds)
    
    fold_encoder = LabelEncoder()
    split_encoded = fold_encoder.fit_transform(folds)
    
    num_classes = len(label_encoder.classes_)
    
    performance = {
        'classes': label_encoder.classes_.tolist(),
        'intervals': {key: np.sum(folds == key) for key in sorted(list(set(folds)))}
    }
    
    split = PredefinedSplit(split_encoded)
    for fold_index, (train_inds, test_inds) in enumerate(split.split()):
        train_x, train_y = [data[ii] for ii in train_inds], [labels[ii] for ii in train_inds]
        test_x, test_y = [data[ii] for ii in test_inds], [labels[ii] for ii in test_inds]
        
        prior_train = [0] * num_classes
        for yy in train_y:
            prior_train[yy] += 1
        
        prior_test = [0] * num_classes
        for yy in test_y:
            prior_test[yy] += 1
        
        clf = deepcopy(workflow)
        clf.fit(train_x, train_y)
        param_dict = {kk: vv.__dict__ for kk, vv in clf.named_steps.items()}
        
        test_pred = clf.predict(test_x)
        
        test_ind = folds[test_inds[0]]
        performance[test_ind] = {
            'accuracy': metrics.accuracy_score(test_y, test_pred),
            'precision_micro': metrics.precision_score(test_y, test_pred, average='micro'),
            'precision_macro': metrics.precision_score(test_y, test_pred, average='macro'),
            'recall_micro': metrics.recall_score(test_y, test_pred, average='micro'),
            'recall_macro': metrics.recall_score(test_y, test_pred, average='macro'),
            'f1_score_micro': metrics.f1_score(test_y, test_pred, average='micro'),
            'f1_score_macro': metrics.f1_score(test_y, test_pred, average='macro'),
            'confusion_matrix': metrics.confusion_matrix(test_y, test_pred).tolist(),
            'prior_train': prior_train,
            'prior_test': prior_test,
            'model': serialise_dict(param_dict)
        }
    
    return serialise_dict(performance)
Example no. 3
def Hyper_parameter_tuning(X, y, model_estimator, configs):
    n_train = len(X)
    # Mark the first 80% of samples as training (-1) and the rest as the validation fold (0)
    n_train_part = int(0.8 * n_train)
    validation_fold_ = [-1] * n_train_part + [0] * (n_train - n_train_part)
    validation_fold = np.random.permutation(validation_fold_)

    # Using GridSearchCV to tune the hyper-parameters
    ps = PredefinedSplit(validation_fold)
    clf = GridSearchCV(model_estimator,
                       configs,
                       return_train_score=True,
                       cv=ps,
                       refit=True,
                       n_jobs=-1,
                       scoring=make_scorer(mean_squared_error, greater_is_better=False))
    clf.fit(X, y)
    return clf
Example no. 4
def k_fold_cv(X, y, feature_desc):
    # since fold 4 will be used as a blind set and not part of training, it is removed from fold_ids list.
    fold_ids = pd.read_csv(
        "data/raw_data/CV_fold_ids_trval.csv")['FoldID'][0:132]
    ps = PredefinedSplit(fold_ids)
    fold_id = 0
    y = y[valence_classifier.label_type]
    for train_index, test_index in ps.split():
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = tune_on_devset(X_train, y_train, X_test, y_test)
        joblib.dump(
            clf,
            "data/models/" + feature_desc + "_fold" + str(fold_id) + '.pkl')
        fold_id += 1
    return
Example no. 5
    def train(self):
        print("- Design the baseline")
        self.build()
        print("- Read train and dev data sets")
        x_traindev, y_traindev, dev_fold = MLcls.read_data(train_file=self.args.train_file,
                                                           dev_file=self.args.dev_file)

        print("- Train the baseline...")
        start = time.time()
        model = GridSearchCV(self.pipeline, self.parameters, cv=PredefinedSplit(test_fold=dev_fold),
                             verbose=5, scoring='f1_weighted')
        model.fit(x_traindev, y_traindev)
        end = time.time()
        print("\t+ Done: %.4f(s)" % (end - start))
        self.best_model = model.best_estimator_
        MLcls.save(self.best_model, self.args.model_name)
Example no. 6
    def get_validation_splits(self, data):
        """ Create cross validation folds of validation data either by date or randomly.

        :param data: DataFrame to split
        :return: Stratified folds of the validation split values.
        """

        if self.file_args['train_test_method'] == 'date':
            week_folds = self.create_week_cv_folds(
                data, self.file_args['num_cv_folds'])
            ps = PredefinedSplit(week_folds)
        elif self.file_args['train_test_method'] == 'random':
            # random_state only takes effect when shuffle=True
            ps = StratifiedKFold(n_splits=self.file_args['num_cv_folds'],
                                 shuffle=True,
                                 random_state=self.file_args['seed'])
        else:
            raise ValueError(
                f"Unknown train_test_method: {self.file_args['train_test_method']}")
        return ps
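create_week_cv_folds is not shown in this snippet; a plausible sketch of the date-based branch, assuming a DataFrame with a datetime column named 'date' (both the column name and the helper below are hypothetical stand-ins), maps each calendar week onto a fold index:

import numpy as np
import pandas as pd
from sklearn.model_selection import PredefinedSplit

def create_week_cv_folds_sketch(data, num_cv_folds, date_col='date'):
    # Hypothetical helper: assign each distinct ISO week to one of num_cv_folds folds
    # (requires pandas >= 1.1 for Series.dt.isocalendar()).
    weeks = data[date_col].dt.isocalendar().week.astype(int)
    unique_weeks = np.sort(weeks.unique())
    week_to_fold = {week: i % num_cv_folds for i, week in enumerate(unique_weeks)}
    return weeks.map(week_to_fold).to_numpy()

# ps = PredefinedSplit(create_week_cv_folds_sketch(data, num_cv_folds=4))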
Example no. 7
            def rolling_model_RF(X_traindata=X_traindata,
                                 Y_traindata_demean=np.ravel(Y_traindata_demean),
                                 X_traindata1=X_traindata1,
                                 Y_traindata1=np.ravel(Y_traindata1),
                                 X_testdata=X_testdata,
                                 Y_testdata=np.ravel(Y_testdata),
                                 mean_Ytrain=mean_Ytrain):


                # specify parameters and distributions to sample from

                split_num = 200 * 60
                num_valid_size = split_num
                test_fold = -1 * np.ones(len(X_traindata))
                test_fold[num_valid_size:] = 0
                ps = PredefinedSplit(test_fold)

                # specify parameters and distributions to sample from
                param_dist = {"max_features": sp_randint(5, 100),
                              "max_depth": sp_randint(3, 10),
                              "min_samples_split": sp_randint(10, 1000),
                              "min_samples_leaf": sp_randint(10, 1000),
                              "n_estimators": sp_randint(3, 100),
                              "oob_score": [True, False]
                              }

                clf_RF = RandomForestRegressor(random_state=100)

                # run randomized search
                n_iter_search = 50
                estim = RandomizedSearchCV(clf_RF, param_distributions=param_dist,
                                           n_iter=n_iter_search, scoring='r2', n_jobs=-1,
                                           cv=ps, random_state=100)  # pass the splitter itself; `iid` was removed in recent scikit-learn

                estim.fit(X_traindata, Y_traindata_demean)
                best_estimator = estim.best_estimator_

                best_VIP = best_estimator.feature_importances_

                train_predict = best_estimator.predict(X_traindata1) + mean_Ytrain
                IS_score = r2_score(Y_traindata1, train_predict)

                test_predict = best_estimator.predict(X_testdata) + mean_Ytrain
                OOS_score = 1 - np.sum((Y_testdata - test_predict)**2) / np.sum((Y_testdata - mean_Ytrain)**2)


                return IS_score, OOS_score, best_VIP
Example no. 8
def grid_search(dataset_path, verbos, saving_path, min_gamma, max_gamma, num_gamma, min_c, max_c, num_c, kernel,
                train = None, dev = None, test = None):
    """
    grid search function for SVM
    see help for explanation about the parameters
    if, train, dev and test are given, ignores dataset_path and uses them instead
    """
    if train is None or dev is None or test is None:
        train, dev, test = ML_util.get_dataset(dataset_path)

    train_squeezed = ML_util.squeeze_clusters(train)
    train_features, train_labels = ML_util.split_features(train_squeezed)
    dev_squeezed = ML_util.squeeze_clusters(dev)

    train_dev = np.concatenate((train_squeezed, dev_squeezed))
    train_dev_features, train_dev_labels = ML_util.split_features(train_dev)
    test_inds = np.concatenate((-1 * np.ones((len(train_squeezed))), np.zeros((len(dev_squeezed)))))
    ps = PredefinedSplit(test_inds)         

    gammas = np.logspace(min_gamma, max_gamma, num_gamma)
    cs = np.logspace(min_c, max_c, num_c)
    
    print()
    parameters = {'C': cs, 'gamma': gammas}
    model = svm.SVC(kernel='rbf', class_weight='balanced')
    clf = GridSearchCV(model, parameters, cv=ps)
    print('Starting grid search...')
    start = time.time()
    clf.fit(train_dev_features, train_dev_labels)
    end = time.time()
    print('Grid search completed in %.2f seconds, best parameters are:' % (end - start)) 
    print(clf.best_params_)

    C = clf.best_params_['C']
    gamma = clf.best_params_['gamma']
    classifier = svm.SVC(kernel=kernel, class_weight='balanced', C=C, gamma=gamma)  # need to create another one as the other trains on both train and dev
    classifier.fit(train_features, train_labels)

    if verbos:
        scores = clf.cv_results_['mean_test_score']
        cs = [round(v, 3) for v in cs]
        gammas = [round(v, 9) for v in gammas]
        create_heatmap(gammas, cs, 'Gamma', 'C', 'SVM Grid Search', scores.reshape((len(cs), len(gammas))), path = saving_path)

    print()
    print('Starting evaluation on test set...')
    return evaluate_predictions(classifier, test, verbos)
Example no. 9
def support_vector_machine(sampling = False, isNotebook = False):
    print("="*60)
    print("Running support vector machine...")
    DATA_FILE = utils.get_data_directory()

    # The argument of the function determines whether we use oversampling or not
    if(sampling):
        process_method = preprocess.oversample(DATA_FILE)
    else:
        process_method = preprocess.preprocess_data(DATA_FILE)

    X, y = process_method
    X_train, X_test, y_train, y_test = utils.split_data(X, y, 0.6)
    X_val, X_test, y_val, y_test = utils.split_data(X_test, y_test, 0.5)

    X_grid = np.concatenate((X_train, X_val))
    y_grid = np.concatenate((y_train, y_val))
    separation_boundary = [-1 for _ in y_train] + [0 for _ in y_val]
    ps = PredefinedSplit(separation_boundary)
    
    param_grid = {
        'C': [1.0, 10.0, 100.0, 1000.0],
        'gamma': [0.01, 0.10, 1.00, 10.00],
        'kernel': ['rbf', 'poly']
    }

    clf = GridSearchCV(SVC(random_state=0, probability=True), param_grid, cv=ps)

    model = clf.fit(X_grid, y_grid)
    train_acc = model.score(X_train, y_train)
    val_acc = model.score(X_val, y_val)
    test_acc = model.score(X_test, y_test)
    print(f'training score: {round(train_acc, 3)}')
    print(f'validation score: {round(val_acc, 3)}')
    print(f'testing score: {round(test_acc, 3)}')
    report_dict = classification_report(y_test, model.predict(X_test), output_dict = True, target_names=["No", "Yes"])

    weights = permutation_importance(model, X_test, y_test)
    top_weights = list(sorted(enumerate(weights.importances_mean), key = lambda x: x[1], reverse = True))

    if isNotebook:
        return top_weights, model
    else:
        utils.display_metrics(report_dict)

    utils.log_results(top_weights)
    utils.generate_report("SVM", "SVM", model, X_test, y_test, report_dict)
Example no. 10
def do_grid_search_ridge(X_train, y_train, X_val, y_val):
    # Now let's use sklearn to help us do hyperparameter tuning
    # GridSearchCV.fit by default splits the data into training and
    # validation itself; we want to use our own splits, so we need to stack our
    # training and validation sets together, and supply an index
    # (validation_fold) to specify which entries are train and which are
    # validation.
    X_train_val = np.vstack((X_train, X_val))
    y_train_val = np.concatenate((y_train, y_val))
    val_fold = [-1] * len(X_train) + [0] * len(X_val)  # 0 corresponds to validation

    # Now we set up and do the grid search over l2reg. The np.concatenate
    # command illustrates my search for the best hyperparameter. In each line,
    # I'm zooming in to a particular hyperparameter range that showed promise
    # in the previous grid. This approach works reasonably well when
    # performance is convex as a function of the hyperparameter, which it seems
    # to be here.
    param_grid = [
        {
            "l2reg": np.unique(
                np.concatenate(
                    (10.0 ** np.arange(-6, 1, 0.3), np.arange(0.01, 0.05, 0.005))
                )
            )
        }
    ]

    ridge_regression_estimator = RidgeRegression()
    grid = GridSearchCV(
        ridge_regression_estimator,
        param_grid,
        return_train_score=True,
        cv=PredefinedSplit(test_fold=val_fold),
        refit=True,
        scoring=make_scorer(mean_squared_error, greater_is_better=False),
    )
    grid.fit(X_train_val, y_train_val)

    df = pd.DataFrame(grid.cv_results_)
    # Flip the sign of the score back, because GridSearchCV maximizes,
    # so it negates the score when greater_is_better=False.
    df["mean_test_score"] = -df["mean_test_score"]
    df["mean_train_score"] = -df["mean_train_score"]
    cols_to_keep = ["param_l2reg", "mean_test_score", "mean_train_score"]
    df_toshow = df[cols_to_keep].fillna("-")
    df_toshow = df_toshow.sort_values(by=["param_l2reg"])
    return grid, df_toshow
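The sign flip above follows from how make_scorer wraps an error metric when greater_is_better=False; a small standalone check of that behavior (not from the original, and independent of the custom RidgeRegression estimator):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error

X = np.arange(10, dtype=float).reshape(-1, 1)
y = 2 * X.ravel() + np.random.default_rng(0).normal(0, 0.1, 10)
est = LinearRegression().fit(X, y)

neg_mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
# The scorer returns -MSE, so flipping the sign recovers the raw error:
assert np.isclose(-neg_mse_scorer(est, X, y), mean_squared_error(y, est.predict(X)))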
Example no. 11
def main():
    """
    Runs an extensive grid search for n_reviews = 1, 2, 5, 10, 'all', Hyperparameters C = 0.1, 1, 10, 100, 1000
    and max_features = 500, 1000, 5000, 10000.

    Outputs the best parameters for each n in n_reviews.
    """
    for n_reviews in (1, 2, 5, 10, 'all'):
        print("n_reviews: ", n_reviews)

        n_samples = 20_000

        dataset_path = f'data/datasets/dataset_{n_reviews}_train.pkl'
        with open(dataset_path, 'rb') as fd:
            data = pickle.load(fd)
            X_train = [' '.join(reviews)
                       for gender, reviews in data][:n_samples]
            y_train = [gender for gender, reviews in data][:n_samples]

        validation_size = 0.25
        test_size = 0.25
        train_size = 1 - test_size - validation_size

        corpus_train, corpus_test, y_train, y_test = train_test_split(
            X_train, y_train, test_size=test_size)

        param_grid = {
            'max_features': [500, 1000, 5000, 10000],
            'C': [0.01, 0.1, 1, 10, 100, 1000],
        }

        ps = PredefinedSplit(test_fold=np.concatenate((
            -np.ones(int(train_size * len(X_train))),
            np.zeros(ceil(validation_size * len(X_train))))))
        gs = GridSearchCV(GenderEstimator(),
                          param_grid,
                          cv=ps,
                          n_jobs=4,
                          verbose=3)

        gs.fit(corpus_train, y_train)

        best_score = gs.best_score_
        best_params = gs.best_params_

        print("Best parameters: ", best_params)
        print("Best score: ", best_score)
Example no. 12
def random_forest(cfg):
    # Load data
    train_df, valid_df, test_df = get_data(cfg)
    df = pd.concat([train_df, valid_df])

    # Remove columns and split data into (X,y)
    df = df.drop([
        'State_AL', 'State_NC', 'isNaN_rep_income', 'State_FL', 'State_LA',
        'isNaN_uti_card_50plus_pct', 'State_SC', 'State_GA', 'State_MS',
        'auto_open_36_month_num', 'card_open_36_month_num', 'ind_acc_XYZ'
    ],
                 axis=1)
    X = df.drop("Default_ind", axis=1).values
    y = df["Default_ind"].values

    # Below 2 lines needed for cross-validation in RandomizedSearchCV
    split_index = [-1] * len(train_df) + [0] * len(valid_df)
    pds = PredefinedSplit(test_fold=split_index)

    # Create classifier and the hyperparameter search space
    classifier = RandomForestClassifier(n_jobs=-1, verbose=1)
    param_grid = {
        "n_estimators": np.arange(50, 1000, 100),
        "max_depth": np.arange(1, 20),
        "criterion": ["gini", "entropy"],
        "min_samples_split": np.arange(2, 10),
        "max_features": [0.8, "sqrt", "log2"],
        "min_samples_leaf": np.arange(1, 5),
        "bootstrap": [True, False],
    }

    model = RandomizedSearchCV(
        estimator=classifier,
        param_distributions=param_grid,
        scoring="f1",
        n_iter=700,
        verbose=1,
        n_jobs=1,
        cv=pds,
    )

    model.fit(X, y)
    print(model.best_score_)
    print(model.best_estimator_.get_params())
    with open("rf.pkl", "wb") as f:
        pickle.dump(model.best_estimator_, f)
Example no. 13
def prepare_data_gridCrossvalidation(path_samples):

    # ------ Fetch samples
    samples_train = fetch_samples(os.path.join(path_samples, 'train'))
    samples_test = fetch_samples(os.path.join(path_samples, 'test'))

    test_fold = []
    for sample in samples_train:
        test_fold.append(sample['fold'])

    # ------ Create feature vector
    X_train, X_test, Y_train, Y_test, class_names, fvector_labels = create_fvector_train_test(
        samples_train, samples_test)

    folds = PredefinedSplit(test_fold)

    return X_train, X_test, Y_train, Y_test, class_names, fvector_labels, folds, samples_train, samples_test
Example no. 14
    def defined(self, test_record_names):
        """Run evaluation ith previously specified detectors on previously
        specified records. Do not use cross-validation but do a predefined
        split taking the given records as test records and all other records
        as training records.

        Args:
            test_record_names (list of str): List of record names to use for
                testing. All other records known to this evaluator are used for
                training the detectors.
        """
        test_fold = [
            0 if record.record_name in test_record_names else -1
            for record in self.records
        ]
        self.cval = PredefinedSplit(test_fold)
        return self._eval_cross_validator()
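A small illustration of the fold labels this produces, with hypothetical record names standing in for self.records (not part of the original code):

record_names = ['rec_01', 'rec_02', 'rec_03']     # stand-ins for record.record_name
test_record_names = ['rec_02', 'rec_03']
test_fold = [0 if name in test_record_names else -1 for name in record_names]
print(test_fold)                                  # -> [-1, 0, 0]: one split, records 2 and 3 held out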
Example no. 15
    def generate_cv_splitter(self, df, p_ids_as_testsets=[]):
        """DEPRECATED: Since BayesOpt requires at least 2 cv splits,
        this function will
        return a cv object that returns indices where the provided profile IDs
        act as testsets. The given df is expected to hold the original train
        and testset but not the validation set as it is part of the early
        stopping criteria.
        """
        assert len(p_ids_as_testsets) > 1, 'provide at least two p_ids that ' \
                                           'shall act as testsets!'
        df.loc[:, 'cv_split'] = np.nan
        for p_enum, p_id in enumerate(p_ids_as_testsets):
            df.loc[df[self.PROFILE_ID_COL] == p_id, ['cv_split']] = p_enum
        df.fillna({'cv_split': -1}, inplace=True)

        ps = PredefinedSplit(test_fold=df['cv_split'].values)
        return ps
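A usage sketch with a toy DataFrame, assuming PROFILE_ID_COL resolves to a column named 'profile_id' (a hypothetical value, not taken from the original code): each listed profile ID becomes its own test fold and every other row stays in training.

import numpy as np
import pandas as pd
from sklearn.model_selection import PredefinedSplit

df = pd.DataFrame({'profile_id': [10, 10, 11, 11, 12, 12]})
p_ids_as_testsets = [11, 12]

cv_split = np.full(len(df), -1)
for p_enum, p_id in enumerate(p_ids_as_testsets):
    cv_split[df['profile_id'] == p_id] = p_enum
ps = PredefinedSplit(test_fold=cv_split)
print(ps.get_n_splits())   # -> 2 splits, one per held-out profile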
Example no. 16
def preprocessed_data(features_dataframe: pd.DataFrame) -> PreprocessedData:
    kwargs = {
        "X_train": features_dataframe[:4].to_numpy(),
        "y_train": np.array([[0, 0, 0, 1], [1, 0, 0, 1], [1, 0, 1, 0], [1, 1, 0, 0]],
                            dtype=np.float32),
        "X_test": features_dataframe[4:5].to_numpy(),
        "y_test": np.array([[0, 0, 0, 1]], dtype=np.float32),
        "splits": PredefinedSplit([-1, -1, -1, 0]),
        "lb": MultiLabelBinarizer(),
    }
    return PreprocessedData(**kwargs)
Example no. 17
def svm_exp():
    print("=" * 60)
    print("Running experiment on SVM...")
    TRAIN_SET = utils.get_data_directory()
    TEST_SET = utils.get_data_directory(fileName="/experiment-dataset.csv")

    X, y = preprocess.oversample(TRAIN_SET)
    X = np.delete(X, slice(4, 13), 1)
    X_train, X_val, y_train, y_val = utils.split_data(X, y, 0.8)
    X_test, y_test = preprocess.preprocess_experiment(TEST_SET)

    X_grid = np.concatenate((X_train, X_val))
    y_grid = np.concatenate((y_train, y_val))
    separation_boundary = [-1 for _ in y_train] + [0 for _ in y_val]
    ps = PredefinedSplit(separation_boundary)

    param_grid = {
        'C': [1.0, 10.0, 100.0, 1000.0],
        'gamma': [0.01, 0.10, 1.00, 10.00],
        'kernel': ['rbf', 'poly']
    }

    print(X_train.shape)
    clf = GridSearchCV(SVC(random_state=0), param_grid, cv=ps)

    model = clf.fit(X_grid, y_grid)
    train_acc = model.score(X_train, y_train)
    val_acc = model.score(X_val, y_val)
    test_acc = model.score(X_test, y_test)
    print(f'training score: {round(train_acc, 3)}')
    print(f'validation score: {round(val_acc, 3)}')
    print(f'testing score: {round(test_acc, 3)}')
    report_dict = classification_report(y_test,
                                        model.predict(X_test),
                                        output_dict=True,
                                        target_names=["No", "Yes"])
    utils.display_metrics(report_dict)

    imps = permutation_importance(model, X_test, y_test)
    top_feature_importances = list(
        sorted(enumerate(imps.importances_mean),
               key=lambda x: x[1],
               reverse=True))
    utils.log_results(top_feature_importances)
    utils.generate_report("Experiment SVM", "Experimental SVM", model, X_test,
                          y_test, report_dict)
Example no. 18
    def validate(self, cv_splits, num_runs):
        x = pd.concat([self.x_train, self.x_val], axis=0)
        y = pd.concat([self.y_train, self.y_val], axis=0)

        if cv_splits == 1:
            splitter = PredefinedSplit([-1 for _ in range(len(x) - 12)] + [0 for _ in range(12)])
            split = list(splitter.split(X=x, y=y)) * num_runs
        else:
            splitter = TimeSeriesSplit(cv_splits, max_train_size=len(x) - 12)
            split = list(splitter.split(X=x, y=y)) * num_runs

        res = map(self._validate, split)
        res = np.mean(list(res), axis=0)

        # K.clear_session()

        return res[0][0], res[1][0]
Example no. 19
    def __init__(self,
                 parameters,
                 n_iter=50,
                 n_initial=10,
                 n_jobs=1,
                 scoring=None,
                 iid=True,
                 verbose=0,
                 pre_dispatch='2*n_jobs',
                 random_state=None,
                 error_score='raise',
                 return_train_score=False,
                 results_filename='hps_results.pkl'):

        fixed_params = {}
        search_spaces = {}
        for par, val in parameters.items():
            if val.__class__ in (Real, Integer, Categorical):
                search_spaces[par] = val
            else:
                fixed_params[par] = val

        self.estimator = SklEstimator(**fixed_params)
        self.results_filename = results_filename

        # to trick scikit-learn (dummy data and split for its CV machinery)
        self.X = np.arange(10)
        vfold = np.zeros((10, ), dtype=int)  # np.int was removed in NumPy >= 1.24
        vfold[:5] = -1
        psplit = PredefinedSplit(vfold)

        super().__init__(self.estimator,
                         search_spaces=search_spaces,
                         optimizer_kwargs=dict(n_initial_points=n_initial),
                         n_iter=n_iter,
                         n_jobs=n_jobs,
                         scoring=scoring,
                         fit_params=None,
                         iid=iid,
                         refit=False,
                         cv=psplit,
                         verbose=verbose,
                         pre_dispatch=pre_dispatch,
                         random_state=random_state,
                         error_score=error_score,
                         return_train_score=return_train_score)
Example no. 20
def get_lr_model_with_cv_on_clean(alphas, X_train_all, Y_train_all, ms, clean_task, T):
    # Spread the weights on sample level and define the CV split
    all_alphas = np.repeat(alphas*T, ms)
    indexes_cv = (-1)*np.ones(X_train_all.shape[0])
    clean_begins = np.sum(ms[:clean_task])
    curr_m = ms[clean_task]
    all_alphas[clean_begins:(clean_begins + curr_m)] = all_alphas[clean_begins:(clean_begins + curr_m)]*(5/4)
    for l in range(5):
        indexes_cv[(clean_begins + l*(int(curr_m/5))):(clean_begins + (l+1)*int((curr_m/5)))] = l
        
    ps = PredefinedSplit(indexes_cv)
    
    # Train on all data, with 5-fold CV on the clean data
    lr = LogisticRegressionCV(fit_intercept = False, cv = ps)
    lr.fit(X_train_all, Y_train_all, sample_weight=all_alphas)
    best_w = lr.coef_[0]
    return best_w
Example no. 21
def rolling_model_GBRTH(df_X, df_Y):
    split_num = 200 * 60
    X_traindata = df_X[:split_num * 2]
    Y_traindata = df_Y[:split_num * 2]
    X_vdata = df_X[split_num:split_num * 2]
    X_testdata = df_X[split_num * 2:split_num * 3]
    Y_testdata = df_Y[split_num * 2:split_num * 3]

    # specify parameters and distributions to sample from

    num_valid_size = len(X_traindata) - len(X_vdata)
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)

    # specify parameters and distributions to sample from
    param_dist = {
        "max_features": sp_randint(5, 100),
        "max_depth": sp_randint(3, 12),
        "min_samples_split": sp_randint(100, 1000),
        "min_samples_leaf": sp_randint(100, 1000),
        "n_estimators": sp_randint(5, 100),
        "learning_rate": uniform(0.001, 0.1),
        "subsample": uniform(0.6, 0.4)
    }

    clf_GBRT = GradientBoostingRegressor(loss='huber', random_state=100)

    # run randomized search
    n_iter_search = 100
    estim = RandomizedSearchCV(clf_GBRT,
                               param_distributions=param_dist,
                               n_iter=n_iter_search,
                               scoring='r2',
                               cv=ps,  # pass the splitter itself; `iid` was removed in recent scikit-learn
                               random_state=100)

    estim.fit(X_traindata, Y_traindata)
    best_estimator = estim.best_estimator_
    v_pred = best_estimator.predict(df_X[:split_num])
    v_performance_score = r2_score(df_Y[:split_num], v_pred)
    test_pre_y_array = best_estimator.predict(X_testdata)
    test_performance_score = r2_score(Y_testdata, test_pre_y_array)

    return v_performance_score, test_performance_score
Example no. 22
    def __fun_param_set(self):
        """Set the parameters used to train the DNN, based on the input parameters."""
        #set the number of neurons in hidden layers
        #layer-2 is half of layer-1
 
        neuron_num_1st_layer = [int(0.1*self.input_dim), int(0.5*self.input_dim)]
        neuron_num_2nd_layer = [int(x/2) for x in neuron_num_1st_layer]
        self.neurons = list(zip(neuron_num_1st_layer, neuron_num_2nd_layer))
        self.neurons = [list(x) for x in self.neurons]

        self.optimizer = Adam() 

        #Set activation function for hidden layer
        self.activation_hidden = 'relu'        

        #set activation function / loss function for the output layer based on output dimensionality
        if self.output_dim > 1:
            self.activation_output = 'sigmoid' #multi-class multi-label classification
            self.loss_fun = 'binary_crossentropy'
        else:
            #a single output unit calls for sigmoid + binary cross-entropy
            #(softmax over one unit would always output 1)
            self.activation_output = 'sigmoid' #binary classification
            self.loss_fun = 'binary_crossentropy'

        #Set batch size
        if self.batch_size_flag:
            self.batch_size = [16, 32] #tune batch size
        else:
            self.batch_size = [32] #fixed batch size

        if self.dropout_flag:
            self.dropout_rate = [0.2, 0.4]
        else:
            self.dropout_rate = [0.4]

        #split training data into training and validation (fast version of model training)
        if self.cv == 1:
            t_size = int(self.x_train.shape[0]*0.8)
            self.train_val_split = [-1]*t_size + [0]*(self.x_train.shape[0]-t_size)
            seed(self.rand_seed)
            shuffle(self.train_val_split)
            self.ps = PredefinedSplit(self.train_val_split)
        else:
            self.ps = self.cv
Example no. 23
    def RecommendByDecisionTree(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """Recommend using a decision tree
           recommendNum : number of recommendations
           max_depth : maximum depth of the decision tree
           min_samples_split : minimum number of samples required to split an internal node
           min_samples_leaf : minimum number of samples required at a leaf node
           class_weight : class weights
        """

        """设定判断参数"""

        """训练集按照3 7开分成训练集和交叉验证集"""

        """自定义验证集 而不是使用交叉验证"""
        test_fold = numpy.zeros(train_data.shape[0])
        test_fold[:ceil(train_data.shape[0] * 0.7)] = -1
        ps = PredefinedSplit(test_fold=test_fold)

        grid_parameters = [
            {'min_samples_leaf': [2, 4, 8, 16, 32, 64], 'max_depth': [2, 4, 6, 8],
             'class_weight': [None]}]  # parameters to tune

        # scores = ['precision', 'recall']  # scoring criteria

        from sklearn.tree import DecisionTreeClassifier
        from sklearn.model_selection import GridSearchCV
        clf = DecisionTreeClassifier()
        clf = GridSearchCV(clf, param_grid=grid_parameters, cv=ps, n_jobs=-1)
        clf.fit(train_data, train_data_y)

        print(clf.best_params_)
        # dot_data = export_graphviz(clf, out_file=None)
        # graph = graphviz.Source(dot_data)
        # graph.render("DTree")

        pre = clf.predict_proba(test_data)
        pre_class = clf.classes_
        # print(pre)
        # print(pre_class)

        recommendList = DataProcessUtils.getListFromProbable(pre, pre_class, recommendNum)
        # print(recommendList)
        answer = [[x] for x in test_data_y]
        # print(answer)
        return [recommendList, answer]
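DataProcessUtils.getListFromProbable is not shown here; a minimal sketch of the idea it presumably implements (a hypothetical stand-in, not the project's actual helper) is to keep the recommendNum most probable classes per row of the probability matrix:

import numpy as np

def top_n_from_probabilities(proba, classes, n):
    # For each sample, return the n classes with the highest predicted
    # probability, in descending order.
    order = np.argsort(proba, axis=1)[:, ::-1][:, :n]
    return [[classes[j] for j in row] for row in order]

# recommendList = top_n_from_probabilities(pre, pre_class, recommendNum)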
Example no. 24
def test_fogd_softmax_gridsearch():
    print(
        "========== Tune parameters for FOGD for multiclass classification =========="
    )

    np.random.seed(random_seed())

    (x_train, y_train), (x_test, y_test) = demo.load_iris()
    print("Number of training samples = {}".format(x_train.shape[0]))
    print("Number of testing samples = {}".format(x_test.shape[0]))

    x = np.vstack((x_train, x_test))
    y = np.concatenate((y_train, y_test))

    params = {'gamma': [0.5, 1.0], 'learning_rate': [0.01, 0.5, 0.1]}

    ps = PredefinedSplit(test_fold=[-1] * x_train.shape[0] +
                         [1] * x_test.shape[0])

    clf = FOGD(model_name="FOGD_hinge",
               D=100,
               lbd=0.0,
               gamma=0.5,
               loss='hinge',
               catch_exception=True,
               random_state=random_seed())

    gs = GridSearchCV(clf, params, cv=ps, n_jobs=-1, refit=False, verbose=True)
    gs.fit(x, y)

    print("Best error {} @ params {}".format(-gs.best_score_, gs.best_params_))

    best_clf = clone(clf).set_params(**gs.best_params_)
    best_clf.fit(x_train, y_train)

    print("Mistake rate = %.4f" % best_clf.mistake)

    # offline prediction
    print("Offline prediction")
    y_train_pred = best_clf.predict(x_train)
    y_test_pred = best_clf.predict(x_test)
    train_err = 1 - metrics.accuracy_score(y_train, y_train_pred)
    test_err = 1 - metrics.accuracy_score(y_test, y_test_pred)
    print("Training error = %.4f" % train_err)
    print("Testing error = %.4f" % test_err)
Example no. 25
def test_tfglm_regression_gridsearch():
    print(
        "========== Tune parameters for TensorFlowGLM for regression =========="
    )

    np.random.seed(random_seed())

    (x_train, y_train), (x_test, y_test) = demo.load_housing()
    print("Number of training samples = {}".format(x_train.shape[0]))
    print("Number of testing samples = {}".format(x_test.shape[0]))

    x = np.vstack((x_train, x_test))
    y = np.concatenate((y_train, y_test))

    params = {'l1_penalty': [0.0, 0.0001], 'l2_penalty': [0.0001, 0.001, 0.01]}

    ps = PredefinedSplit(test_fold=[-1] * x_train.shape[0] +
                         [1] * x_test.shape[0])

    clf = TensorFlowGLM(
        model_name="TensorFlowGLM_regression_gridsearch",
        task='regression',
        link='linear',  # link function
        loss='quadratic',  # loss function
        l2_penalty=0.0,  # ridge regularization
        l1_penalty=0.0,  # Lasso regularization
        l1_smooth=1E-5,  # smoothing for Lasso regularization
        l1_method='pseudo_huber',  # approximation method for L1-norm
        learning_rate=0.0001,
        catch_exception=True,
        random_state=random_seed())

    gs = GridSearchCV(clf, params, cv=ps, n_jobs=1, refit=False, verbose=True)
    gs.fit(x, y)

    print("Best MSE {} @ params {}".format(-gs.best_score_, gs.best_params_))

    best_clf = clone(clf).set_params(**gs.best_params_)
    best_clf.fit(x_train, y_train)

    train_err = -best_clf.score(x_train, y_train)
    test_err = -best_clf.score(x_test, y_test)
    print("Training MSE = %.4f" % train_err)
    print("Testing MSE = %.4f" % test_err)
    assert abs(test_err + gs.best_score_) < 1e-4
Example no. 26
def test_rsrbm_gridsearch():
    print(
        "========== Tuning parameters for the pipeline of "
        "ReplicatedSoftmaxRBM followed by k-nearest-neighbors (kNN) =========="
    )

    np.random.seed(random_seed())

    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import PredefinedSplit
    from sklearn.neighbors import KNeighborsClassifier

    (x_train, y_train), (x_test, y_test) = demo.load_20newsgroups()

    x = np.vstack([x_train, x_test])
    y = np.concatenate([y_train, y_test])

    estimators = [('rbm',
                   ReplicatedSoftmaxRBM(num_hidden=15,
                                        num_visible=5000,
                                        batch_size=32,
                                        num_epochs=2,
                                        learning_rate=0.001,
                                        learning_rate_hidden=0.00001,
                                        momentum_method='sudden',
                                        weight_cost=2e-4,
                                        random_state=random_seed(),
                                        verbose=0)),
                  ('knn', KNeighborsClassifier(n_neighbors=4))]

    params = dict(rbm__num_hidden=[10, 15],
                  rbm__batch_size=[64, 100],
                  knn__n_neighbors=[1, 2])

    ps = PredefinedSplit(test_fold=[-1] * x_train.shape[0] +
                         [1] * x_test.shape[0])

    clf = Pipeline(estimators)

    gs = GridSearchCV(clf, params, cv=ps, n_jobs=-1, refit=False, verbose=True)
    gs.fit(x, y)

    print("Best error {} @ params {}".format(1.0 - gs.best_score_,
                                             gs.best_params_))
Example no. 27
def computePredefinedSplit(dataset, parameters):
    tenFold = TenFoldArffFile(dataset)
    X = None
    Y = None
    foldIdx = 0
    while tenFold.loadNextFold():
        xTrain, yTrain, xTest, yTest = getFoldData(tenFold)
        if X is None:
            X = np.concatenate([xTrain, xTest])
            Y = np.concatenate([yTrain, yTest])
            indexes = np.full(X.shape[0], -1)
        xTrain = xTrain.to_numpy()
        xTest = xTest.to_numpy()
        for item in xTest:
            index = np.where((X == item).all(axis=1))[0]
            indexes[index] = foldIdx
        foldIdx += 1
    return X, Y, PredefinedSplit(indexes)
Example no. 28
def get_n_fold_by_drugs(all_drugs, n_splits=5):
    unique_drugs = np.unique(all_drugs, axis=0)
    test_folds = np.ones(all_drugs.shape[0])
    kf = KFold(n_splits, shuffle=True, random_state=15)  # random_state only takes effect when shuffle=True

    j = 0
    for _, validation_drugs in kf.split(np.arange(unique_drugs.shape[0])):
        val_inds = []

        for drug_ind in validation_drugs:
            willbe_added = list(
                np.where((~(all_drugs == unique_drugs[drug_ind, :])).sum(
                    axis=1) == 0)[0])
            val_inds += willbe_added
        test_folds[val_inds] = j
        j += 1

    return PredefinedSplit(test_folds)
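A quick sanity check of the grouping idea with toy data (not part of the original): rows that share the same drug vector always receive the same fold label.

import numpy as np

all_drugs = np.array([[0, 1], [0, 1], [1, 0], [1, 0], [1, 1], [1, 1]])
ps = get_n_fold_by_drugs(all_drugs, n_splits=3)
test_fold = ps.test_fold
# every pair of identical drug rows carries the same fold label
assert test_fold[0] == test_fold[1] and test_fold[2] == test_fold[3] and test_fold[4] == test_fold[5]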
Example no. 29
    def gridsearch_method(self, df):
        code = df.iloc[0]["product_code"]
        if code not in self.gs_code:
            self.gs_code.append(code)
            df.sort_index(inplace=True)
            from sklearn.model_selection import GridSearchCV
            from sklearn.model_selection import PredefinedSplit
            from sklearn.ensemble import GradientBoostingRegressor
            train_feature = df.head(df.shape[0] - 18).iloc[:][self.train_col]
            train_real = df.head(df.shape[0] - 18).iloc[:]["True_volume"]
            val_split = np.zeros(train_feature.shape[0])
            val_split[:(train_feature.shape[0] - 18)] = -1
            ps = PredefinedSplit(test_fold=val_split)
            GBR = GradientBoostingRegressor(random_state=0)
            self.clf[code] = GridSearchCV(GBR, self.param, scoring='neg_mean_absolute_error', cv=ps)
            self.clf[code].fit(train_feature, train_real)
            print(code, self.clf[code].best_params_)
Example no. 30
    def cv_score(self, leaf, lay, bootstrap):
        balance = {0: 1, 1: 1.4}
        self.forest = RandomForestClassifier(n_estimators=100,
                                             min_samples_leaf=leaf,
                                             max_depth=lay,
                                             class_weight=balance,
                                             bootstrap=bootstrap)

        cv_score = cross_validate(self.forest,
                                  self.train_set_params,
                                  self.train_set_labels,
                                  cv=PredefinedSplit(self.stock_kfold_idxs),
                                  return_train_score=True)
        train_score, test_score = cv_score["train_score"], cv_score["test_score"]
        print(
            f"The training score was {np.mean(train_score)} and the validation score {np.mean(test_score)}"
        )