Example #1
0
def test_prob_methods():
    """Smoke test: a grid-searched LogisticRegression yields non-None
    predictions and class probabilities on held-out data."""
    dataset = load_breast_cancer()
    features, labels = dataset["data"], dataset["target"]

    # Hold out 30% of the data, stratified on the label.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.3,
        random_state=0,
        stratify=labels,
    )

    # Grid-search regularization strengths using a validation set.
    searcher = GridSearch(
        model=LogisticRegression(),
        param_grid={'C': [1, 10, 100, 120, 150]},
    )
    searcher.fit(X_train, y_train, verbose=False)

    assert searcher.predict(X_test) is not None
    assert searcher.predict_proba(X_test) is not None
Example #2
0
class Prediction:
    """Train a regression model on the 'precio' target and build a submission.

    Parameters
    ----------
    data
        Tuple of (train_df, test_df) pandas DataFrames; train_df must
        contain the target column 'precio'.
    model
        Scikit-learn-like estimator.
    prefix
        Filename prefix for saved artifacts (a timestamp is appended).
    param_grid
        Hyperparameter grid (list of dicts for manualGridSearch, or a
        grid dict for the search helpers). None/empty means no search.
    """

    def __init__(self, data, model, prefix, param_grid=None):
        self.train_df, self.test_df = data
        self.model = model
        # Avoid the shared-mutable-default pitfall; falling back to []
        # preserves the original `param_grid == []` semantics in save().
        self.param_grid = [] if param_grid is None else param_grid
        self.prefix = prefix + datetime.now().strftime('%m-%d-%H:%M')
        self.X = self.train_df.loc[:, self.train_df.columns != 'precio']
        self.y = self.train_df['precio'].values
        # Hold out 10% of the training rows for validation.
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X, self.y, test_size=0.1, random_state=1)

    def manualGridSearch(self):
        """Try every grid entry; keep the one with the lowest validation MAE."""
        best_score = math.inf
        for g in self.param_grid:
            print(g)
            self.model.set_params(**g)
            self.model.fit(self.X_train, self.y_train)
            score = mean_absolute_error(self.model.predict(self.X_val),
                                        self.y_val)
            print(score)
            # save if best
            # BUG FIX: the local running minimum was never updated, so every
            # candidate with a finite score overwrote best_grid and the
            # *last* grid always won.
            if score < best_score:
                best_score = score
                self.best_score = score
                self.best_grid = g

    def gridSearchTrain(self):
        """Cross-validated grid search (GridSearchCV); stores best params,
        best score, and predictions on the test frame."""
        print('Training...')
        self.gscv = GridSearchCV(self.model,
                                 self.param_grid,
                                 scoring='neg_mean_absolute_error',
                                 verbose=10)
        self.gscv.fit(self.X_train, self.y_train)
        self.best_params = self.gscv.best_params_
        self.score = self.gscv.best_score_
        self.predicted = self.gscv.predict(self.test_df)
        print(self.best_params)
        print(self.score)

    def HypOptTrain(self):
        """Validation-set grid search via hypopt's GridSearch."""
        print('Training...')
        self.opt = GridSearch(model=self.model, param_grid=self.param_grid)
        self.opt.fit(self.X_train,
                     self.y_train,
                     self.X_val,
                     self.y_val,
                     scoring='neg_mean_squared_error')
        self.best_params = self.opt.best_params_
        # BUG FIX: X_val/y_val were referenced as bare names (NameError);
        # they live on the instance.
        self.score = self.opt.score(self.X_val, self.y_val)
        self.predicted = self.opt.predict(self.test_df)
        print(self.best_params)
        print(self.score)

    def train(self):
        """Plain fit on the training split; score is MAE on the validation split."""
        print('Training...')
        self.model.fit(self.X_train, self.y_train)
        self.score = mean_absolute_error(self.model.predict(self.X_val),
                                         self.y_val)
        print(self.score)
        self.predicted = self.model.predict(self.test_df)

    def crossValidation(self, cv=5):
        """Mean negative-MAE over `cv` folds on the full training data."""
        cv_scores = cross_val_score(
            self.model,
            self.X,
            self.y,
            cv=cv,
            scoring='neg_mean_absolute_error'
        )  # print each cv score (accuracy) and average them
        self.score = np.mean(cv_scores)
        print(self.score)

    def save(self):
        """Pickle the fitted model (or the GridSearchCV object).

        NOTE(review): with a non-empty grid this pickles self.gscv, which
        only exists after gridSearchTrain(); calling save() after
        HypOptTrain() would raise AttributeError — confirm intended.
        """
        if not self.param_grid:
            with open('{}.model'.format(self.prefix), 'wb') as f:
                pickle.dump(self.model, f)
        else:
            with open('{}.model'.format(self.prefix), 'wb') as f:
                pickle.dump(self.gscv, f)

    def submit(self):
        """Write a '<prefix>-<score>.csv' submission of (id, prediction)."""
        self.test_ids = pd.read_csv('data/test.csv')['id']
        answer = pd.DataFrame(list(zip(self.test_ids, self.predicted)),
                              columns=['id', 'target'])
        answer.to_csv('{}-{}.csv'.format(self.prefix, int(round(self.score))),
                      sep=',',
                      index=False)
class MachineLearning:
    """Machine learning class to run sklearn-like pipeline on MethylationArray data.
    Initialize object with scikit-learn model, and optionally supply a hyperparameter search grid.

    model
        Scikit-learn-like model, classification, regression, dimensionality reduction, clustering etc.
    options
        Options to supply model in form of dictionary.
    grid
        Alternatively, supply search grid to search for best hyperparameters.
    labelencode
        T/F encode string labels.
    n_eval
        Number of evaluations for randomized grid search, if set to 0, perform exhaustive grid search
    """
    def __init__(self, model, options, grid=None, labelencode=False, n_eval=0):
        # grid=None replaces a shared mutable {} default; `if grid:` treats
        # None and {} identically, so behavior is unchanged for callers.
        if grid:
            self.model = GridSearch(
                model=model(),
                param_grid=grid,
                num_random_search=None if not n_eval else n_eval)
            self.param_grid_exists = True
            self.grid = grid
        else:
            self.model = model(**options)
            self.param_grid_exists = False
        if labelencode:
            self.encoder = LabelEncoder()
        else:
            self.encoder = None

    def fit(self,
            train_methyl_array,
            val_methyl_array=None,
            outcome_cols=None):
        """Fit data to model.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        val_methyl_array
            Validation MethylationArray. Can set to None, but is accessed
            unconditionally when a hyperparameter grid was supplied.
        outcome_cols
            Set to none if not needed, but phenotype column to train on, can be multiple.
        """
        if outcome_cols is not None:
            if self.encoder is not None:
                self.encoder.fit(train_methyl_array.pheno[outcome_cols])
            # Encode labels once; identical branch logic to the original,
            # hoisted out for readability.
            y_train = (self.encoder.transform(
                train_methyl_array.pheno[outcome_cols])
                       if self.encoder is not None else
                       train_methyl_array.pheno[outcome_cols])
            if self.param_grid_exists:
                y_val = (self.encoder.transform(
                    val_methyl_array.pheno[outcome_cols])
                         if self.encoder is not None else
                         val_methyl_array.pheno[outcome_cols])
                # Classification metric when labels are encoded, r2 otherwise.
                self.model.fit(
                    train_methyl_array.beta,
                    y_train,
                    val_methyl_array.beta,
                    y_val,
                    scoring='accuracy' if self.encoder is not None else 'r2')
            else:
                self.model.fit(train_methyl_array.beta, y_train)
        else:
            # Unsupervised fit (clustering / dimensionality reduction).
            self.model.fit(train_methyl_array.beta)
        return self.model

    def transform(self, test_methyl_array):
        """Transform test methylation array.

        Parameters
        ----------
        test_methyl_array
            Testing MethylationArray.
        """
        self.results = self.model.transform(test_methyl_array.beta)
        return self.results

    def fit_transform(self, train_methyl_array, outcome_cols=None):
        """Fit and transform to training data.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        outcome_cols
            Set to none if not needed, but phenotype column to train on, can be multiple.
        """
        # BUG FIX: outcome_cols was hard-coded to None, and .transform was
        # called on the raw model with the MethylationArray object instead
        # of its beta matrix; route through the wrapper methods instead.
        self.fit(train_methyl_array, outcome_cols=outcome_cols)
        self.results = self.transform(train_methyl_array)
        return self.results

    def predict(self, test_methyl_array):
        """Make new predictions on test methylation array.

        Parameters
        ----------
        test_methyl_array
            Testing MethylationArray.
        """
        self.results = self.model.predict(test_methyl_array.beta)
        if self.encoder is not None:
            # Map encoded labels back to their original string values.
            self.results = self.encoder.inverse_transform(self.results)
        return self.results

    def fit_predict(self, train_methyl_array, outcome_cols=None):
        """Fit and predict training data.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        outcome_cols
            Set to none if not needed, but phenotype column to train on, can be multiple.
        """
        # BUG FIX: outcome_cols was passed positionally into fit()'s
        # val_methyl_array slot, and predict() was called on the raw model
        # with the MethylationArray; use the wrappers with keywords.
        self.fit(train_methyl_array, outcome_cols=outcome_cols)
        self.results = self.predict(train_methyl_array)
        return self.results

    def store_results(self, output_pkl, results_dict=None):
        """Store results in pickle file.

        Parameters
        ----------
        output_pkl
            Output pickle to dump results to.
        results_dict
            Supply own results dict to be dumped.
        """
        if not results_dict:
            results_dict = dict(results=self.results)
        # BUG FIX: originally dumped to open(results_dict, 'wb') — the dict
        # itself instead of the output path — and leaked the file handle.
        with open(output_pkl, 'wb') as f:
            pickle.dump(results_dict, f)

    def assign_results_to_pheno_col(self, methyl_array, new_col, output_pkl):
        """Assign results to new phenotype column.

        Parameters
        ----------
        methyl_array
            MethylationArray.
        new_col
            New column name.
        output_pkl
            Output pickle to dump MethylationArray to.
        """
        methyl_array.pheno[new_col] = self.results
        methyl_array.write_pickle(output_pkl)

    def transform_results_to_beta(self, methyl_array, output_pkl):
        """Transform beta matrix into reduced beta matrix and store.

        Parameters
        ----------
        methyl_array
            MethylationArray.
        output_pkl
            Output pickle to dump MethylationArray to.
        """
        # BUG FIX: self.beta does not exist on this class; the row index
        # comes from the array being transformed.
        methyl_array.beta = pd.DataFrame(self.results,
                                         index=methyl_array.beta.index)
        methyl_array.write_pickle(output_pkl)

    def return_outcome_metric(self,
                              methyl_array,
                              outcome_cols,
                              metric,
                              run_bootstrap=False,
                              n_bootstrap=1000):
        """Supply metric to evaluate results.

        Parameters
        ----------
        methyl_array
            MethylationArray to evaluate.
        outcome_cols
            Outcome phenotype columns.
        metric
            Sklearn evaluation metric.
        run_bootstrap
            Make 95% CI from n_bootstrap bootstraps.
        n_bootstrap
            Number of bootstrap resamples (default 1000).
        """
        y_true = methyl_array.pheno[outcome_cols]
        y_pred = self.results
        # BUG FIX: referenced undefined names `bootstrap`/`n_bootstrap`;
        # use the run_bootstrap flag and a real n_bootstrap parameter.
        if not run_bootstrap:
            return metric(y_true, y_pred)
        else:
            from sklearn.utils import resample
            # BUG FIX: a fixed random_state=123 made every resample
            # identical (std_err was always 0); vary the seed per draw.
            boot_results = np.array([
                metric(*resample(y_true, y_pred, random_state=i))
                for i in range(n_bootstrap)
            ])
            original = metric(y_true, y_pred)
            std_err = np.std(boot_results)
            boot_results = np.sort(boot_results)
            ci = 0.95
            bound = (1 - ci) / 2.

            # BORROWED FROM MLXTEND
            def quantile(x, q):
                """Return the element of sorted array x at quantile q."""
                rank = round(q * x.shape[0]) - 1
                if rank >= x.shape[0]:
                    # BUG FIX: clamp to a valid index (was x.shape[0],
                    # which would be out of bounds).
                    rank = x.shape[0] - 1
                elif rank <= 0:
                    rank = 0
                rank = int(round(rank))
                return x[rank]

            high_ci = quantile(boot_results, q=(ci + bound))
            low_ci = quantile(boot_results, q=bound)
            return original, std_err, (low_ci, high_ci)
def classifier(classifier, train, truth, validate, validate_truth, test,
               test_truth, datatype):
    """Grid-search the named classifier and report test-set performance.

    Parameters
    ----------
    classifier : str
        One of 'svm', 'randomforest', 'adaboost', 'knn'; anything else
        selects a multi-layer perceptron.
    train, truth : array-like
        Training samples and labels.
    validate, validate_truth : array-like
        Validation samples and labels (used by the second GridSearch fit).
    test, test_truth : array-like
        Test samples and labels, scored and reported.
    datatype : str
        Tag included in the log lines.
    """
    np.random.seed(0)
    # NOTE(review): permutation(1)[0] is always 0, so rng == 0 every call.
    rng = np.random.permutation(1)[0]
    train = pd.DataFrame(train)
    validate = pd.DataFrame(validate)
    test = pd.DataFrame(test)
    logger = logging.getLogger('myapp')
    # BUG FIX: a fresh FileHandler was attached on every invocation,
    # duplicating each log line once per prior call; attach only once.
    if not logger.handlers:
        hdlr = logging.FileHandler('classifiers.log')
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
    logger.setLevel(logging.WARN)
    if classifier.lower(
    ) == 'svm':  #best: C = 50, gamma = 0.0001, kernel = rbf
        model = svm.SVC(random_state=rng)
        hyperparameter = {
            'kernel': ('linear', 'rbf'),
            'C': [1, 1.5, 10, 50, 100, 200],
            'gamma': [1e-7, 1e-4]
        }
    elif classifier.lower() == 'randomforest':  #120
        model = RandomForestClassifier(random_state=rng)
        hyperparameter = {'n_estimators': np.arange(10, 300, 10)}
    elif classifier.lower() == 'adaboost':
        model = AdaBoostClassifier(random_state=rng)
        hyperparameter = {
            'n_estimators': np.arange(10, 300, 10),
            'algorithm': ('SAMME', 'SAMME.R')
        }
    elif classifier.lower() == 'knn':  #120
        model = KNeighborsClassifier()
        hyperparameter = dict(n_neighbors=list(range(1, 100)))
    else:  ## assume it's asking for neural network (multi-layer perceptron)
        model = MLPClassifier(
            max_iter=100
        )  #activation=tanh, hiddenlayersize=(20,20), 'learning_rate'=adaptive,solver=lbfgs
        hyperparameter = {
            'hidden_layer_sizes': [(20, 20), (80, 20), (80, 20, 20),
                                   (80, 40, 40, 20), (40, 40, 20, 20, 20, 10)],
            'learning_rate': ['adaptive'],
            'activation': ['tanh', 'relu', 'logistic'],
            'solver': ['lbfgs', 'sgd', 'adam']
        }
    tuned_model = GridSearch(model=model, param_grid=hyperparameter)
    # First fit: no validation set supplied (internal CV split).
    tuned_model.fit(train, truth)
    prediction = tuned_model.score(test, test_truth)
    # logger.warn() is deprecated; logger.warning() is the supported API.
    logger.warning(classifier + ' ' + datatype + ' validate    ' +
                   str(prediction))
    # Second fit: explicit validation set for hyperparameter selection.
    tuned_model.fit(train, truth, validate, validate_truth)
    prediction = tuned_model.score(test, test_truth)
    target_names = [
        'c-CS-s', 'c-CS-m', 'c-SC-s', 'c-SC-m', 't-CS-s', 't-CS-m', 't-SC-s',
        't-SC-m'
    ]
    prediction = tuned_model.predict(test)
    print(
        classification_report(test_truth,
                              prediction,
                              target_names=target_names))
    logger.warning(classifier + ' ' + datatype + '    ' + str(prediction))
    return
def main():
    """Average pre-trained word vectors per sentence of the Stanford
    Sentiment Treebank, tune a linear SVM on the validation split, and
    report binary sentiment accuracy on the test split (stdout + file).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--vectors_file', required=True, type=str)
    parser.add_argument('--data_path',
                        default='stanfordSentimentTreebank',
                        type=str)
    parser.add_argument('--output_file', required=True, type=str)
    args = parser.parse_args()

    # NOTE(review): `vectors` is a local name, so this guard always raises
    # NameError and the vectors are always loaded; kept for parity with
    # the original notebook-style code.
    try:
        vectors
    except NameError:
        print('Reading vectors file ...  ', end='')
        t = time.time()
        # First pass: count lines to pre-size the vector matrix.
        with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
            vocab_size = sum(1 for line in f)

        # Second pass: infer dimensionality from the first data line.
        with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
            line = f.readline()
            val = line.rstrip().split(' ')
            check = False
            if len(
                    val
            ) == 2:  # Check if the vectors file has vocab size and dimensionality in the first line
                val = f.readline().rstrip().split(' ')
                vocab_size -= 1
                check = True
            vector_dim = len(list(map(float, val[1:])))

        vectors = np.zeros((vocab_size, vector_dim))

        words = [""] * vocab_size
        vocab_dict = dict()
        # Third pass: fill the matrix and the word -> row-index mapping.
        with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
            if check:
                next(f)  # skip the header line detected above
            for idx, line in enumerate(f):
                vals = line.rstrip().split(' ')

                words[idx] = vals[0]
                vocab_dict[vals[0]] = idx  # indices start from 0
                vec = list(map(float, vals[1:]))
                try:
                    vectors[idx, :] = vec
                except IndexError:
                    if vals[0] == '<unk>':  # ignore the <unk> vector
                        pass
                    else:
                        raise Exception('IncompatibleInputs')

        print("done in " + str(int(time.time() - t)) + " seconds")

    print('Reading train and test data ...  ', end='')
    t = time.time()
    # phrase text -> phrase id
    dictionary = dict()
    with codecs.open(args.data_path + "/dictionary.txt", 'r', "UTF-8") as f:
        for line in f.read().splitlines():
            tmp = line.split("|")
            dictionary[tmp[0]] = int(tmp[1])

    with codecs.open(args.data_path + "/datasetSentences.txt", "r",
                     "UTF-8") as f:
        sentences = []
        for sentence in f.read().splitlines()[1:]:
            sentences.append(sentence.split("\t")[1])

    # sentiment score in [0, 1] per phrase id
    all_labels = []
    with open(args.data_path + "/sentiment_labels.txt") as f:
        for label in f.read().splitlines()[1:]:
            all_labels.append(float(label.split("|")[1]))

    # split class per sentence: 1=train, 2=test, 3=validation
    split_classes = []
    with open(args.data_path + "/datasetSplit.txt") as f:
        for line in f.read().splitlines()[1:]:
            split_classes.append(int(line.split(",")[1]))

    print("done in " + str(int(time.time() - t)) + " seconds")

    print(
        'Generating train and test samples from the data for selected classes ...  ',
        end='')
    t = time.time()

    train_size = sum(1 for label in split_classes if label == 1)
    val_size = sum(1 for label in split_classes if label == 3)
    test_size = sum(1 for label in split_classes if label == 2)

    train_samples = np.zeros([train_size, vector_dim])
    train_labels = []

    val_samples = np.zeros([val_size, vector_dim])
    val_labels = []

    test_samples = np.zeros([test_size, vector_dim])
    test_labels = []

    train_no = 0
    val_no = 0
    test_no = 0
    not_in_dict_count = 0
    for sample_no, sentence in enumerate(sentences):
        # BUG FIX: narrowed the bare `except:` — only a missing dictionary
        # entry (KeyError) or an out-of-range phrase id (IndexError)
        # should skip the sentence.
        try:
            score = all_labels[dictionary[sentence]]
        except (KeyError, IndexError):
            not_in_dict_count += 1
            continue

        if score <= 0.4 or score > 0.6:  # Eliminate neutral sentences
            inds = process_sentence(sentence, vocab_dict)
            if len(inds) > 0:
                if split_classes[sample_no] == 1:
                    # Sentence embedding = mean of its word vectors.
                    for ind in inds:
                        train_samples[train_no, :] += vectors[ind, :]

                    train_samples[
                        train_no, :] = train_samples[train_no, :] / len(inds)

                    if score <= 0.4:
                        train_labels.append(0)
                    elif score > 0.6:
                        train_labels.append(1)

                    train_no += 1

                elif split_classes[sample_no] == 3:
                    for ind in inds:
                        val_samples[val_no, :] += vectors[ind, :]

                    val_samples[val_no, :] = val_samples[val_no, :] / len(inds)

                    if score <= 0.4:
                        val_labels.append(0)
                    elif score > 0.6:
                        val_labels.append(1)

                    val_no += 1

                elif split_classes[sample_no] == 2:
                    for ind in inds:
                        test_samples[test_no, :] += vectors[ind, :]

                    test_samples[
                        test_no, :] = test_samples[test_no, :] / len(inds)

                    if score <= 0.4:
                        test_labels.append(0)
                    elif score > 0.6:
                        test_labels.append(1)

                    test_no += 1

    # Trim the pre-sized matrices to the rows actually filled.
    train_samples = train_samples[:train_no, :]
    val_samples = val_samples[:val_no, :]
    test_samples = test_samples[:test_no, :]

    print("done in " + str(int(time.time() - t)) + " seconds")

    print('Training linear SVM for parameter optimization ... ', end='')

    tuned_parameters = [{
        'kernel': ['linear'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    }]
    clf = GridSearch(model=SVC(), param_grid=tuned_parameters)
    clf.fit(train_samples, train_labels, val_samples, val_labels)

    print("done in " + str(int(time.time() - t)) + " seconds")

    predicted_labels = clf.predict(test_samples)
    accuracy = sum([
        true == predicted
        for true, predicted in zip(test_labels, predicted_labels)
    ]) / len(test_samples) * 100

    print("Accuracy for sentiment classification of sentences is: " +
          str(round(accuracy, 2)) + "% (" +
          str(int(accuracy / 100 * len(predicted_labels))) + "/" +
          str(len(predicted_labels)) + ")")

    # BUG FIX: the output file was opened without a context manager and
    # would leak on an exception before close().
    with open(args.output_file, "w") as f_out:
        f_out.write("Accuracy for sentiment classification is: " +
                    str(round(accuracy, 2)) + "% (" +
                    str(int(accuracy / 100 * len(predicted_labels))) + "/" +
                    str(len(predicted_labels)) + ")\n")