Example 1
    def mlp_classifier(self):
        def get_hidden_layers():
            
            x = [64, 128, 256]
            hl = []

            for i in range(1, len(x)):
                hl.extend([p for p in itertools.product(x, repeat=i+1)])

            return hl
        
        clf = MLPClassifier(solver='adam', alpha=1e-5, early_stopping=True,
                            random_state=self.random_state)
        
        hidden_layer_sizes = get_hidden_layers()
        params = {'hidden_layer_sizes': hidden_layer_sizes}

        mdl = GridSearch(model=clf, param_grid=params)
        
        mdl.fit(self.x_train, self.y_train, self.x_val, self.y_val)

        self.reports['mlp'] = self.get_results(mdl)
        
        model_path = os.path.join(MODELS, dir_name, file_name+'.mlp')
        joblib.dump(mdl, model_path)
Example 2
 def HypOptTrain(self):
     print('Training...')
     self.opt = GridSearch(model=self.model, param_grid=self.param_grid)
     self.opt.fit(self.X_train,
                  self.y_train,
                  self.X_val,
                  self.y_val,
                  scoring='neg_mean_squared_error')
     self.best_params = self.opt.best_params_
     self.score = self.opt.score(self.X_val, self.y_val)
     self.predicted = self.opt.predict(self.test_df)
     print(self.best_params)
     print(self.score)
Example 3
def test_gridsearch_crossval(
    model=SVC(random_state=0),
    return_model=False,
    param_grid=None,
    opt_score=0.9298,
    assertions=True,
    scoring=None,
    verbose=False,
):
    data = load_breast_cancer()

    # Create test and train sets from one dataset
    X_train, X_test, y_train, y_test = train_test_split(
        data["data"],
        data["target"],
        test_size=0.3,
        random_state=0,
        stratify=data["target"],
    )

    # List the parameters to search across
    if param_grid is None:
        param_grid = {
            'C': [1, 10, 100, 120, 150],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf'],
        }

    # Grid-search all parameter combinations WITHOUT a validation set.
    gs = GridSearch(
        model=model,
        param_grid=param_grid,
    )
    gs.fit(X_train, y_train, scoring=scoring, verbose=False)

    # Compare with default model without hyperopt
    default = SVC(random_state=0)
    default.fit(X_train, y_train)

    default_score = round(default.score(X_test, y_test), 4)
    gs_score = round(gs.score(X_test, y_test), 4)

    if verbose:
        print('Default score:', default_score, '| GridSearch Score:', gs_score)

    if assertions:
        assert (gs_score == opt_score)

    if return_model:
        return gs
Example 4
 def __init__(self, model, options, grid={}, labelencode=False, n_eval=0):
     if grid:
         self.model = GridSearch(
             model=model(),
             param_grid=grid,
             num_random_search=None if not n_eval else n_eval)
         self.param_grid_exists = True
         self.grid = grid
     else:
         self.model = model(**options)
         self.param_grid_exists = False
     if labelencode:
         self.encoder = LabelEncoder()
     else:
         self.encoder = None
Example 5
    def rf_regressor(self):

        clf = RandomForestRegressor(random_state=self.random_state)

        params = {'n_estimators': [int(x) for x in np.linspace(start=100, stop=800, num=8)],
                  'max_features': ['auto', 'sqrt', 'log2']}

        mdl = GridSearch(model=clf, param_grid=params)

        print('Fitting Rf')
        mdl.fit(self.x_train, self.y_train, self.x_val, self.y_val)
        self.reports['rf'] = self.get_results(mdl)

        model_path = os.path.join(MODELS, dir_name, file_name + '.rf')

        joblib.dump(mdl, model_path)
Example 6
    def xgb_classifier(self):
        
        clf = xgb.XGBClassifier(random_state=self.random_state)
        
        params = {
            'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.3, 0.5],
            'n_estimators': [int(x) for x in np.linspace(start=50, stop=800, num=16)],
            'colsample_bytree': [i / 10.0 for i in range(3, 11)]}

        mdl = GridSearch(model=clf, param_grid=params)
        print('Fitting XGBoost')

        mdl.fit(self.x_train, self.y_train, self.x_val, self.y_val)

        self.reports['xgb'] = self.get_results(mdl)
                
        model_path = os.path.join(MODELS, dir_name, file_name+'.xgb')
        joblib.dump(mdl, model_path)
Example 7
def test_prob_methods():
    data = load_breast_cancer()

    # Create test and train sets from one dataset
    X_train, X_test, y_train, y_test = train_test_split(
        data["data"],
        data["target"],
        test_size=0.3,
        random_state=0,
        stratify=data["target"],
    )

    # List the parameters to search across
    param_grid = {'C': [1, 10, 100, 120, 150]}

    # Grid-search all parameter combinations using a validation set.
    model = GridSearch(
        model=LogisticRegression(),
        param_grid=param_grid,
    )
    model.fit(X_train, y_train, verbose=False)

    assert (model.predict(X_test) is not None)
    assert (model.predict_proba(X_test) is not None)
Example 8
# ### Grid-search time comparison using a validation set versus cross-validation.
# #### The hypopt package automatically distributes work across all CPU threads regardless of whether you use a validation set or cross-validation.
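
# A minimal, self-contained sketch (not part of the original notebook) contrasting the two
# GridSearch modes timed below. It follows the fit signature used in the surrounding cells
# (param_grid passed to fit); the toy dataset, tiny grid, and `demo_*` names are illustrative only.
from hypopt import GridSearch
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

toy_X, toy_y = load_breast_cancer(return_X_y=True)
toy_X_train, toy_X_val, toy_y_train, toy_y_val = train_test_split(
    toy_X, toy_y, test_size=0.3, random_state=0)
toy_grid = {'alpha': [1e-4, 1e-2]}

# Validation-set mode: each parameter setting is scored on the held-out set.
demo_val = GridSearch(model=MLPClassifier(max_iter=6, random_state=0))
demo_val.fit(toy_X_train, toy_y_train, toy_grid, toy_X_val, toy_y_val)

# Cross-validation mode: no validation set is passed, so cv_folds folds are
# carved out of the training data instead.
demo_cv = GridSearch(model=MLPClassifier(max_iter=6, random_state=0), cv_folds=5)
demo_cv.fit(toy_X_train, toy_y_train, toy_grid)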

# In[4]:

print("First let's try the neural network with default parameters.")
default = MLPClassifier(max_iter=6, random_state=0)
get_ipython().magic(u'time default.fit(X_train, y_train)')
test_score = round(default.score(X_test, y_test), 4)
val_score = round(default.score(X_val, y_val), 4)
print('\nTEST SCORE (default parameters):', test_score)
print('VALIDATION SCORE (default parameters):', val_score)

# In[5]:

gs_val = GridSearch(model=MLPClassifier(max_iter=6, random_state=0))
print("Grid-search using a validation set.\n", "-" * 79)
get_ipython().magic(
    u'time gs_val.fit(X_train, y_train, param_grid, X_val, y_val)')
test_score = round(gs_val.score(X_test, y_test), 4)
val_score = round(gs_val.score(X_val, y_val), 4)
print('\nTEST SCORE (hyper-parameter optimization with validation set):',
      test_score)
print('VALIDATION SCORE (hyper-parameter optimization with validation set):',
      val_score)

# In[6]:

gs_cv = GridSearch(model=MLPClassifier(max_iter=6, random_state=0), cv_folds=5)
print(
    "\n\nLet's see how long grid-search takes to run when we don't use a validation set."
Example 9
class Prediction:
    def __init__(self, data, model, prefix, param_grid=[]):
        self.train_df, self.test_df = data
        self.model = model
        self.param_grid = param_grid
        self.prefix = prefix + datetime.now().strftime('%m-%d-%H:%M')
        self.X = self.train_df.loc[:, self.train_df.columns != 'precio']
        self.y = self.train_df['precio'].values
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X, self.y, test_size=0.1, random_state=1)

    def manualGridSearch(self):
        best_score = math.inf
        for g in self.param_grid:
            print(g)
            self.model.set_params(**g)
            self.model.fit(self.X_train, self.y_train)
            score = mean_absolute_error(self.model.predict(self.X_val),
                                        self.y_val)
            print(score)
            # save if best
            if score < best_score:
                best_score = score
                self.best_score = score
                self.best_grid = g

    def gridSearchTrain(self):
        print('Training...')
        self.gscv = GridSearchCV(self.model,
                                 self.param_grid,
                                 scoring='neg_mean_absolute_error',
                                 verbose=10)
        self.gscv.fit(self.X_train, self.y_train)
        self.best_params = self.gscv.best_params_
        self.score = self.gscv.best_score_
        self.predicted = self.gscv.predict(self.test_df)
        print(self.best_params)
        print(self.score)

    def HypOptTrain(self):
        print('Training...')
        self.opt = GridSearch(model=self.model, param_grid=self.param_grid)
        self.opt.fit(self.X_train,
                     self.y_train,
                     self.X_val,
                     self.y_val,
                     scoring='neg_mean_squared_error')
        self.best_params = self.opt.best_params_
        self.score = self.opt.score(self.X_val, self.y_val)
        self.predicted = self.opt.predict(self.test_df)
        print(self.best_params)
        print(self.score)

    def train(self):
        print('Training...')
        self.model.fit(self.X_train, self.y_train)
        self.score = mean_absolute_error(self.model.predict(self.X_val),
                                         self.y_val)
        print(self.score)
        self.predicted = self.model.predict(self.test_df)

    def crossValidation(self, cv=5):
        cv_scores = cross_val_score(
            self.model,
            self.X,
            self.y,
            cv=cv,
            scoring='neg_mean_absolute_error'
        )  #print each cv score (accuracy) and average them
        self.score = np.mean(cv_scores)
        print(self.score)

    def save(self):
        if self.param_grid == []:
            with open('{}.model'.format(self.prefix), 'wb') as f:
                pickle.dump(self.model, f)
        else:
            with open('{}.model'.format(self.prefix), 'wb') as f:
                pickle.dump(self.gscv, f)

    def submit(self):
        self.test_ids = pd.read_csv('data/test.csv')['id']
        answer = pd.DataFrame(list(zip(self.test_ids, self.predicted)),
                              columns=['id', 'target'])
        answer.to_csv('{}-{}.csv'.format(self.prefix, int(round(self.score))),
                      sep=',',
                      index=False)
        # features = vect_val.get_feature_names()
        # for feature in features:
        #     print(feature)
        # print('feature counts: {0}'.format(len(features)))

        X_train = vect_val.transform(X_train)
        X_val = vect_val.transform(X_val)
        print('******** GridSearch ********')
        param_grid = {
            'n_estimators': [40, 60, 80, 100, 120],
            'learning_rate': [0.1, 0.15, 0.2],
            'max_depth': [6, 7, 8, 9, 10]
        }
        scorer = make_scorer(f2)

        gs = GridSearch(model=GradientBoostingClassifier())
        gs.fit(X_train, y_train, param_grid, X_val, y_val, scoring=scorer)
        print('params: ', gs.get_best_params())
        print('Test Score for Optimized Parameters:', gs.score(X_val, y_val))

    # print('******** GradientBoostingClassifier ********')
    # gb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80, max_depth=7)
    # gb, preds_train, preds = train_and_predict(gb, X_train, y_train, X_val)
    # print_scores(y_val, preds, y_train, preds_train)
    #
    # print('******** AdamBoostingClassifier ********')
    # ada = AdaBoostClassifier()
    # ada, preds_train, preds = train_and_predict(ada, X_train, y_train, X_val)
    # print_scores(y_val, preds, y_train, preds_train)
    #
    # print('******** XgBoostClassifier ********')
Example 11
# Create a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, 
    y_train, 
    test_size = 0.3, 
    random_state = 0,
    stratify = y_train,
)

# List the parameters to search across
param_grid = {
    'C': [1, 10, 100, 120, 150], 
    'gamma': [0.001, 0.0001], 
    'kernel': ['rbf'],
}

# Grid-search all parameter combinations using a validation set.
gs = GridSearch(model = SVC(random_state=0), param_grid=param_grid, parallelize=False)
# You can choose the metric to optimize (f1, auc_roc, accuracy, etc.)
# scoring = None will default to optimizing model.score()
_ = gs.fit(X_train, y_train, X_val, y_val, scoring = 'f1')

# Compare with default model without hyperopt
default = SVC(random_state=0)
_ = default.fit(X_train, y_train)
print('\nTest score comparison (larger is better):')
print('Non-optimized Parameters:', round(default.score(X_test, y_test), 4))
print('Optimized Parameters:', round(gs.score(X_test, y_test), 4))

Example 12
# ### The hypopt package automatically distributes work across all CPU threads regardless of whether you use a validation set or cross-validation.

# In[4]:

print("First let's try the neural network with default parameters.")
default = MLPClassifier(max_iter=50, random_state=0)
default.fit(X_train, y_train)
test_score = round(default.score(X_test, y_test), 4)
val_score = round(default.score(X_val, y_val), 4)
print('\nTEST SCORE (default parameters):', test_score)
print('VALIDATION SCORE (default parameters):', val_score)

# In[5]:


gs_val = GridSearch(model=MLPClassifier(max_iter=50, random_state=0),
                    param_grid=param_grid,
                    parallelize=False)
print("Grid-search using a validation set.\n", "-" * 79)
get_ipython().magic(
    u"time gs_val.fit(X_train, y_train, X_val, y_val, scoring = 'accuracy')")
test_score = round(gs_val.score(X_test, y_test), 4)
val_score = round(gs_val.score(X_val, y_val), 4)
print('\nTEST SCORE (hyper-parameter optimization with validation set):',
      test_score)
print('VALIDATION SCORE (hyper-parameter optimization with validation set):',
      val_score)

# In[6]:

gs_cv = GridSearch(model=MLPClassifier(max_iter=50, random_state=0),
                   param_grid=param_grid,
                   cv_folds=6)
Example 13
print("Length of Train/Test:",len(X_train),len(X_dev))

params = {
    'lr' : np.random.uniform(0,1,[5]).tolist() + [1.0],
    'ada' : np.random.uniform(0,1,[5]).tolist() + [1.0],
    'et' : np.random.uniform(0,1,[5]).tolist() + [1.0],
    'xgb' : np.random.uniform(0,1,5).tolist() + [1.0],
    'gb' : np.random.uniform(0,1,[5]).tolist() + [1.0],
    'rf' : np.random.uniform(0,1,[5]).tolist() + [1.0], 
    'br' : np.random.uniform(0,1,5).tolist() + [1.0], 
    'lasso' : np.random.uniform(0,1,[5]).tolist() + [1.0], 
    'lrf' : [math.floor,math.ceil,avg],
    'adaf' : [math.floor,math.ceil,avg],
    'etf' : [math.floor,math.ceil,avg],
    'xgbf' : [math.floor,math.ceil,avg],
    'gbf' : [math.floor,math.ceil,avg],
    'rff' :[math.floor,math.ceil,avg],
    'brf' : [math.floor,math.ceil,avg],
    'lassof' : [math.floor,math.ceil,avg],
}


model = GridSearch(model=N2C2Classifier(), param_grid=params)

model.fit(X_train, y_train, X_dev, y_dev, verbose=True)

print(model.get_best_params())
print(model.get_best_score())


    random_state=0,
)

# Create a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=0,
)

# List the parameters to search across
param_grid = {
    'C': [1, 10, 100, 120, 150],
    'gamma': [0.001, 0.0001],
    'kernel': ['rbf'],
}

# Grid-search all parameter combinations using a validation set.
gs = GridSearch(model=SVR())
# Choose the metric to optimize (r2, explained_variance, etc.)
# scoring = None will default to optimizing model.score()
_ = gs.fit(X_train, y_train, param_grid, X_val, y_val, scoring='r2')

# Compare with default model without hyperopt
default = SVR()
_ = default.fit(X_train, y_train)
print('\nTest score comparison (larger is better):')
print('Non-optimized Parameters:', round(default.score(X_test, y_test), 4))
print('Optimized Parameters:', round(gs.score(X_test, y_test), 4))
def classifier(classifier, train, truth, validate, validate_truth, test,
               test_truth, datatype):
    np.random.seed(0)
    rng = np.random.permutation(1)[0]
    train = pd.DataFrame(train)
    validate = pd.DataFrame(validate)
    test = pd.DataFrame(test)
    logger = logging.getLogger('myapp')
    hdlr = logging.FileHandler('classifiers.log')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.setLevel(logging.WARN)
    if classifier.lower() == 'svm':  # best: C=50, gamma=0.0001, kernel='rbf'
        model = svm.SVC(random_state=rng)
        hyperparameter = {
            'kernel': ('linear', 'rbf'),
            'C': [1, 1.5, 10, 50, 100, 200],
            'gamma': [1e-7, 1e-4]
        }
    elif classifier.lower() == 'randomforest':  #120
        model = RandomForestClassifier(random_state=rng)
        hyperparameter = {'n_estimators': np.arange(10, 300, 10)}
    elif classifier.lower() == 'adaboost':
        model = AdaBoostClassifier(random_state=rng)
        hyperparameter = {
            'n_estimators': np.arange(10, 300, 10),
            'algorithm': ('SAMME', 'SAMME.R')
        }
    elif classifier.lower() == 'knn':  #120
        model = KNeighborsClassifier()
        hyperparameter = dict(n_neighbors=list(range(1, 100)))
    else:  # assume it's asking for neural network (multi-layer perceptron)
        # best: activation='tanh', hidden_layer_sizes=(20, 20), learning_rate='adaptive', solver='lbfgs'
        model = MLPClassifier(max_iter=100)
        hyperparameter = {
            'hidden_layer_sizes': [(20, 20), (80, 20), (80, 20, 20),
                                   (80, 40, 40, 20), (40, 40, 20, 20, 20, 10)],
            'learning_rate': ['adaptive'],
            'activation': ['tanh', 'relu', 'logistic'],
            'solver': ['lbfgs', 'sgd', 'adam']
        }
    tuned_model = GridSearch(model=model, param_grid=hyperparameter)
    tuned_model.fit(train, truth)
    prediction = tuned_model.score(test, test_truth)
    logger.warn(classifier + ' ' + datatype + ' validate    ' +
                str(prediction))
    tuned_model.fit(train, truth, validate, validate_truth)
    prediction = tuned_model.score(test, test_truth)
    target_names = [
        'c-CS-s', 'c-CS-m', 'c-SC-s', 'c-SC-m', 't-CS-s', 't-CS-m', 't-SC-s',
        't-SC-m'
    ]
    prediction = tuned_model.predict(test)
    print(
        classification_report(test_truth,
                              prediction,
                              target_names=target_names))
    logger.warn(classifier + ' ' + datatype + '    ' + str(prediction))
    return
class MachineLearning:
    """Machine learning class to run sklearn-like pipeline on MethylationArray data.
    Initialize object with scikit-learn model, and optionally supply a hyperparameter search grid.

    model
        Scikit-learn-like model, classification, regression, dimensionality reduction, clustering etc.
    options
        Options to supply model in form of dictionary.
    grid
        Alternatively, supply a search grid to search for the best hyperparameters (a brief usage sketch follows the class definition).
    labelencode
        T/F encode string labels.
    n_eval
        Number of evaluations for randomized grid search, if set to 0, perform exhaustive grid search
    """
    def __init__(self, model, options, grid={}, labelencode=False, n_eval=0):
        if grid:
            self.model = GridSearch(
                model=model(),
                param_grid=grid,
                num_random_search=None if not n_eval else n_eval)
            self.param_grid_exists = True
            self.grid = grid
        else:
            self.model = model(**options)
            self.param_grid_exists = False
        if labelencode:
            self.encoder = LabelEncoder()
        else:
            self.encoder = None

    def fit(self,
            train_methyl_array,
            val_methyl_array=None,
            outcome_cols=None):
        """Fit data to model.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        val_methyl_array
            Validation MethylationArray. Can set to None.
        outcome_cols
            Set to none if not needed, but phenotype column to train on, can be multiple.
        """
        if outcome_cols != None:
            if self.encoder != None:
                self.encoder.fit(train_methyl_array.pheno[outcome_cols])
            if self.param_grid_exists:
                self.model.fit(
                    train_methyl_array.beta,
                    self.encoder.transform(
                        train_methyl_array.pheno[outcome_cols])
                    if self.encoder != None else
                    train_methyl_array.pheno[outcome_cols],
                    val_methyl_array.beta,
                    self.encoder.transform(
                        val_methyl_array.pheno[outcome_cols])
                    if self.encoder != None else
                    val_methyl_array.pheno[outcome_cols],
                    scoring='accuracy' if self.encoder != None else 'r2')
            else:
                self.model.fit(
                    train_methyl_array.beta,
                    self.encoder.transform(
                        train_methyl_array.pheno[outcome_cols])
                    if self.encoder != None else
                    train_methyl_array.pheno[outcome_cols])
        else:
            self.model.fit(train_methyl_array.beta)
        return self.model

    def transform(self, test_methyl_array):
        """Transform test methylation array.

        Parameters
        ----------
        test_methyl_array
            Testing MethylationArray.
        """
        self.results = self.model.transform(test_methyl_array.beta)
        return self.results

    def fit_transform(self, train_methyl_array, outcome_cols=None):
        """Fit and transform to training data.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        outcome_cols
            Set to none if not needed, but phenotype column to train on, can be multiple.
        """
        self.results = self.fit(
            train_methyl_array,
            outcome_cols=outcome_cols).transform(train_methyl_array.beta)
        return self.results

    def predict(self, test_methyl_array):
        """Make new predictions on test methylation array.

        Parameters
        ----------
        test_methyl_array
            Testing MethylationArray.
        """
        self.results = self.model.predict(test_methyl_array.beta)
        if self.encoder != None:
            self.results = self.encoder.inverse_transform(self.results)
        return self.results

    def fit_predict(self, train_methyl_array, outcome_cols=None):
        """Fit and predict training data.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        outcome_cols
            Set to none if not needed, but phenotype column to train on, can be multiple.
        """
        self.results = self.fit(
            train_methyl_array,
            outcome_cols=outcome_cols).predict(train_methyl_array.beta)
        return self.results

    def store_results(self, output_pkl, results_dict={}):
        """Store results in pickle file.

        Parameters
        ----------
        output_pkl
            Output pickle to dump results to.
        results_dict
            Supply own results dict to be dumped.
        """
        if not results_dict:
            results_dict = dict(results=self.results)
        pickle.dump(results_dict, open(output_pkl, 'wb'))

    def assign_results_to_pheno_col(self, methyl_array, new_col, output_pkl):
        """Assign results to new phenotype column.

        Parameters
        ----------
        methyl_array
            MethylationArray.
        new_col
            New column name.
        output_pkl
            Output pickle to dump MethylationArray to.
        """
        methyl_array.pheno[new_col] = self.results
        methyl_array.write_pickle(output_pkl)

    def transform_results_to_beta(self, methyl_array, output_pkl):
        """Transform beta matrix into reduced beta matrix and store.

        Parameters
        ----------
        methyl_array
            MethylationArray.
        output_pkl
            Output pickle to dump MethylationArray to.
        """
        methyl_array.beta = pd.DataFrame(self.results,
                                         index=methyl_array.beta.index)
        methyl_array.write_pickle(output_pkl)

    def return_outcome_metric(self,
                              methyl_array,
                              outcome_cols,
                              metric,
                              run_bootstrap=False):
        """Supply metric to evaluate results.

        Parameters
        ----------
        methyl_array
            MethylationArray to evaluate.
        outcome_cols
            Outcome phenotype columns.
        metric
            Sklearn evaluation metric.
        run_bootstrap
            Make 95% CI from 1k bootstraps.
        """
        y_true = methyl_array.pheno[outcome_cols]
        y_pred = self.results
        if not run_bootstrap:
            return metric(y_true, y_pred)
        else:
            from sklearn.utils import resample
            n_bootstrap = 1000  # per the docstring: 95% CI from 1k bootstraps
            boot_results = np.array([
                metric(*resample(y_true, y_pred, random_state=i))
                for i in range(n_bootstrap)
            ])
            original = metric(y_true, y_pred)
            std_err = np.std(boot_results)
            boot_results = np.sort(boot_results)
            ci = 0.95
            bound = (1 - ci) / 2.

            # BORROWED FROM MLXTEND
            def quantile(x, q):
                rank = round(q * x.shape[0]) - 1
                if rank >= x.shape[0]:
                    rank = x.shape[0]
                elif rank <= 0:
                    rank = 0
                rank = int(round(rank))
                return x[rank]

            high_ci = quantile(boot_results, q=(ci + bound))
            low_ci = quantile(boot_results, q=bound)
            return original, std_err, (low_ci, high_ci)
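
# A minimal usage sketch for the MachineLearning wrapper above (not from the original
# repo). `train_arr`, `val_arr`, and `test_arr` stand in for MethylationArray objects
# with a `.beta` feature matrix and a 'disease' column in `.pheno`; the model class,
# grid, and column name are illustrative assumptions only.
from sklearn.ensemble import RandomForestClassifier

ml = MachineLearning(
    model=RandomForestClassifier,  # a class, not an instance; instantiated internally
    options={},                    # ignored when a non-empty grid is supplied
    grid={'n_estimators': [100, 300], 'max_depth': [None, 10]},
    labelencode=True,              # label-encode string phenotype values
    n_eval=0,                      # 0 = exhaustive grid search, >0 = randomized search
)
ml.fit(train_arr, val_arr, outcome_cols='disease')
predictions = ml.predict(test_arr)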
    #'learning_rate':[0.0001,0.001,0.01,0.1],
    #'eval_set': [[(X_test,y_test)]],
    'gamma': [0, 0.0001, 0.001, 0.01, 0.1],
    'eval_metric': ['rmse'],
    'verbose': [True],
    'silent': [False],
    'min_child_weight': list(np.arange(1, X_test.shape[1], 5)),
    'n_estimators': [10, 100, 200, 300, 1000]
}]

xg_reg = xgb.XGBRegressor()

# Applying Grid Search to find the best model and the best parameters
from hypopt import GridSearch
from sklearn.model_selection import GridSearchCV
grid_search = GridSearch(model=xg_reg, param_grid=parameters)

grid_search = grid_search.fit(X_train,
                              y_train,
                              X_val=X_test,
                              y_val=y_test,
                              scoring='neg_mean_squared_error')
best_parameters = grid_search.get_params()

#best_mse = (-grid_search.best_score_)**(1/2)
#best_parameters = grid_search.best_params_
'''best_parameters['n_estimators'] = 10000
best_parameters['gamma'] = 0
best_parameters['min_child_weight'] = 2'''

xg_reg = xgb.XGBRegressor(**best_parameters)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--vectors_file', required=True, type=str)
    parser.add_argument('--data_path',
                        default='stanfordSentimentTreebank',
                        type=str)
    parser.add_argument('--output_file', required=True, type=str)
    args = parser.parse_args()

    try:
        vectors
    except NameError:
        print('Reading vectors file ...  ', end='')
        t = time.time()
        with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
            vocab_size = sum(1 for line in f)

        with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
            line = f.readline()
            val = line.rstrip().split(' ')
            check = False
            if len(val) == 2:  # Check if the vectors file has vocab size and dimensionality in the first line
                val = f.readline().rstrip().split(' ')
                vocab_size -= 1
                check = True
            vector_dim = len(list(map(float, val[1:])))

        vectors = np.zeros((vocab_size, vector_dim))

        words = [""] * vocab_size
        vocab_dict = dict()
        with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
            if check:
                next(f)
            for idx, line in enumerate(f):
                vals = line.rstrip().split(' ')

                words[idx] = vals[0]
                vocab_dict[vals[0]] = idx  # indices start from 0
                vec = list(map(float, vals[1:]))
                try:
                    vectors[idx, :] = vec
                except IndexError:
                    if vals[0] == '<unk>':  # ignore the <unk> vector
                        pass
                    else:
                        raise Exception('IncompatibleInputs')

        print("done in " + str(int(time.time() - t)) + " seconds")

    print('Reading train and test data ...  ', end='')
    t = time.time()
    dictionary = dict()
    with codecs.open(args.data_path + "/dictionary.txt", 'r', "UTF-8") as f:
        for line in f.read().splitlines():
            tmp = line.split("|")
            dictionary[tmp[0]] = int(tmp[1])

    with codecs.open(args.data_path + "/datasetSentences.txt", "r",
                     "UTF-8") as f:
        sentences = []
        for sentence in f.read().splitlines()[1:]:
            sentences.append(sentence.split("\t")[1])

    all_labels = []
    with open(args.data_path + "/sentiment_labels.txt") as f:
        for label in f.read().splitlines()[1:]:
            all_labels.append(float(label.split("|")[1]))

    split_classes = []
    with open(args.data_path + "/datasetSplit.txt") as f:
        for line in f.read().splitlines()[1:]:
            split_classes.append(int(line.split(",")[1]))

    print("done in " + str(int(time.time() - t)) + " seconds")

    print(
        'Generating train and test samples from the data for selected classes ...  ',
        end='')
    t = time.time()

    train_size = sum([1 for label in split_classes if label == 1])
    val_size = sum([1 for label in split_classes if label == 3])
    test_size = sum([1 for label in split_classes if label == 2])

    train_samples = np.zeros([train_size, vector_dim])
    train_labels = []

    val_samples = np.zeros([val_size, vector_dim])
    val_labels = []

    test_samples = np.zeros([test_size, vector_dim])
    test_labels = []

    train_no = 0
    val_no = 0
    test_no = 0
    not_in_dict_count = 0
    for sample_no, sentence in enumerate(sentences):
        try:
            score = all_labels[dictionary[sentence]]
        except:
            not_in_dict_count += 1
            continue

        if score <= 0.4 or score > 0.6:  # Eliminate neutral sentences
            inds = process_sentence(sentence, vocab_dict)
            if len(inds) > 0:
                if split_classes[sample_no] == 1:
                    for ind in inds:
                        train_samples[train_no, :] += vectors[ind, :]

                    train_samples[
                        train_no, :] = train_samples[train_no, :] / len(inds)

                    if score <= 0.4:
                        train_labels.append(0)
                    elif score > 0.6:
                        train_labels.append(1)

                    train_no += 1

                elif split_classes[sample_no] == 3:
                    for ind in inds:
                        val_samples[val_no, :] += vectors[ind, :]

                    val_samples[val_no, :] = val_samples[val_no, :] / len(inds)

                    if score <= 0.4:
                        val_labels.append(0)
                    elif score > 0.6:
                        val_labels.append(1)

                    val_no += 1

                elif split_classes[sample_no] == 2:
                    for ind in inds:
                        test_samples[test_no, :] += vectors[ind, :]

                    test_samples[
                        test_no, :] = test_samples[test_no, :] / len(inds)

                    if score <= 0.4:
                        test_labels.append(0)
                    elif score > 0.6:
                        test_labels.append(1)

                    test_no += 1

    train_samples = train_samples[:train_no, :]
    val_samples = val_samples[:val_no, :]
    test_samples = test_samples[:test_no, :]

    print("done in " + str(int(time.time() - t)) + " seconds")

    print('Training linear SVM for parameter optimization ... ', end='')

    tuned_parameters = [{
        'kernel': ['linear'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    }]
    clf = GridSearch(model=SVC(), param_grid=tuned_parameters)
    clf.fit(train_samples, train_labels, val_samples, val_labels)

    print("done in " + str(int(time.time() - t)) + " seconds")

    predicted_labels = clf.predict(test_samples)
    accuracy = sum([
        true == predicted
        for true, predicted in zip(test_labels, predicted_labels)
    ]) / len(test_samples) * 100

    print("Accuracy for sentiment classification of sentences is: " +
          str(round(accuracy, 2)) + "% (" +
          str(int(accuracy / 100 * len(predicted_labels))) + "/" +
          str(len(predicted_labels)) + ")")

    f_out = open(args.output_file, "w")
    f_out.write("Accuracy for sentiment classification is: " +
                str(round(accuracy, 2)) + "% (" +
                str(int(accuracy / 100 * len(predicted_labels))) + "/" +
                str(len(predicted_labels)) + ")\n")
    f_out.close()
X_train, X_test, Y_train, Y_test = train_test_split(new, df[is_duplicate], test_size=0.06, stratify=df[is_duplicate])
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.06, stratify=Y_train)

print("Train dataset size: ", X_train.shape[0])
print("Validation dataset size: ", X_val.shape[0])
print("Test dataset size: ", X_test.shape[0])

del new


# ### Logistic Regression
parameters = [{"penalty": ["l1", "l2"], 'C': [0.3, 0.5, 0.6]}]

clf = LogisticRegression()
if not os.path.exists('final_logistic.pkl'):
    clf = GridSearch(model=clf)
    clf.fit(X_train, Y_train, parameters, X_val, Y_val)
else:
    clf = pickle.load(open('final_logistic.pkl', 'rb'))
print(clf)

pred = clf.predict(X_test)
scores = clf.predict_proba(X_test)[:, 1]

test_acc = clf.score(X_test, Y_test)
train_acc = clf.score(X_train, Y_train)
val_acc = clf.score(X_val, Y_val)

fpr, tpr, thresholds = roc_curve(Y_test, scores, pos_label=1)

name = "Logistic Regression"
    def fit(self, X, Y, X_val, Y_val):
        param_grid = []

        val_mode = False
        if X_val is not None and Y_val is not None:
            val_mode = True
            if isinstance(self.base_estimator, sklearn.svm.SVC):
                self.base_estimator.set_params(probability=True)
                param_grid = [
                    {
                        'C': [1, 10, 100],
                        'kernel': ['linear']
                    },
                    {
                        'C': [1, 10, 100],
                        'gamma': [0.001, 0.0001],
                        'kernel': ['rbf']
                    },
                ]
            elif isinstance(self.base_estimator,
                            sklearn.linear_model.LogisticRegression):
                param_grid = [{
                    'solver': ['liblinear'],
                    'penalty': ['l1', 'l2'],
                    'C': [0.0001, 0.001, 0.01, 1, 100]
                }]
        else:
            print('not in validation mode')
            if isinstance(self.base_estimator, sklearn.svm.SVC):
                self.base_estimator.set_params(gamma='auto')
                self.base_estimator.set_params(kernel='linear')
            elif isinstance(self.base_estimator,
                            sklearn.linear_model.LogisticRegression):
                self.base_estimator.set_params(solver='liblinear')
        """Fit the model to data matrix X and targets Y.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.
        Y : array-like, shape (n_samples, n_classes)
            The target values.

        Returns
        -------
        self : object
            Returns self.
        """
        X, Y = check_X_y(X, Y, multi_output=True, accept_sparse=True)

        random_state = check_random_state(self.random_state)

        check_array(X, accept_sparse=True)
        self.order_ = self.order
        if self.order_ is None:
            self.order_ = np.array(range(Y.shape[1]))
        elif isinstance(self.order_, str):
            if self.order_ == 'random':
                self.order_ = random_state.permutation(Y.shape[1])
        elif sorted(self.order_) != list(range(Y.shape[1])):
            raise ValueError("invalid order")

        self.estimators_ = [
            clone(self.base_estimator) for _ in range(Y.shape[1])
        ]

        self.classes_ = []

        if self.cv is None:
            Y_pred_chain = sp.lil_matrix((X.shape[0], Y.shape[1]))
            X_aug = sp.hstack((X, Y_pred_chain), format='lil')

            if val_mode:
                Y_pred_chain_val = sp.lil_matrix((X_val.shape[0], Y.shape[1]))
                X_val_aug = sp.hstack((X_val, Y_pred_chain_val.copy()),
                                      format='lil')

        elif sp.issparse(X):
            Y_pred_chain = sp.lil_matrix((X.shape[0], Y.shape[1]))
            X_aug = sp.hstack((X, Y_pred_chain), format='lil')

            if val_mode:
                Y_pred_chain_val = sp.lil_matrix((X_val.shape[0], Y.shape[1]))
                X_val_aug = sp.hstack((X_val, Y_pred_chain_val.copy()),
                                      format='lil')

        else:
            Y_pred_chain = np.zeros((X.shape[0], Y.shape[1]))
            X_aug = np.hstack((X, Y_pred_chain))

            if val_mode:
                Y_pred_chain_val = np.zeros((X_val.shape[0], Y.shape[1]))
                X_val_aug = np.hstack((X_val, Y_pred_chain_val.copy()))

        del Y_pred_chain

        if val_mode:
            del Y_pred_chain_val

        class_1 = 1
        class_2 = 0
        if -1 in Y:
            class_2 = -1

        for chain_idx, estimator in enumerate(self.estimators_):
            y = Y[:, self.order_[chain_idx]]

            # class_1_counter = np.count_nonzero(y[:, 0] == class_1)
            # class_2_counter = np.count_nonzero(y[:, 0] == class_2)
            class_1_counter = y.flatten().tolist().count(class_1)
            class_2_counter = y.flatten().tolist().count(class_2)

            if class_1_counter <= class_2_counter:
                minority_index = 1
                minority_counter = class_1_counter
                majority_index = 0
            else:
                minority_index = 0
                minority_counter = class_2_counter
                majority_index = 1

            # get all the minority samples
            sampled_index = [
                index for index, label in enumerate(y)
                if label == minority_index
            ]
            sampled_y = [minority_index] * minority_counter
            #print('m'+str(len(sampled_y))+' '+str(minority_index)+'s')

            sampled_index.extend(sampled_index)
            sampled_y.extend(sampled_y)

            # sample the majority samples
            temp_sampled_index = [
                index for index, label in enumerate(y)
                if label == majority_index
            ]
            #print(str(len(temp_sampled_index))+' '+str(majority_index)+'s')
            sampled_index.extend(
                random.sample(temp_sampled_index, minority_counter))
            sampled_y.extend([majority_index] * minority_counter)

            print(
                str(chain_idx) + '___' + str(self.order_[chain_idx]) +
                ') training label:' + str(chain_idx) + ' with ' +
                str(len(sampled_y)) + ' instances ')
            #print('X_aug[np.array(sampled_index), :(X.shape[1] + chain_idx)]: '+str(X_aug[np.array(sampled_index), :(X.shape[1] + chain_idx)].shape))
            #print('np.array(sampled_y): '+str(np.array(sampled_y).shape))

            if val_mode:
                # for the grid search version
                gs = GridSearch(model=estimator, param_grid=param_grid)
                temp_estimator = gs.fit(
                    X_aug[np.array(sampled_index), :(X.shape[1] + chain_idx)],
                    np.array(sampled_y),
                    X_val_aug[:, :(X.shape[1] + chain_idx)],
                    Y_val[:, self.order_[chain_idx]],
                    scoring='roc_auc')
            else:
                estimator.fit(
                    X_aug[np.array(sampled_index), :(X.shape[1] + chain_idx)],
                    np.array(sampled_y))

            if chain_idx < len(self.estimators_) - 1:
                # produce the predictions and add them as features for the next classifier in the chain
                col_idx = X.shape[1] + chain_idx
                # use all the available features(from X_train and all the predictions thus far)

                if val_mode:
                    previous_predictions_val = temp_estimator.predict(
                        X_val_aug[:, :col_idx])
                    previous_predictions = temp_estimator.predict(
                        X_aug[:, :col_idx])
                else:
                    previous_predictions = estimator.predict(
                        X_aug[:, :col_idx])

                # insert the predictions as features to be used for the next classifier in the chain
                if sp.issparse(X_aug):
                    X_aug[:, col_idx] = np.expand_dims(previous_predictions, 1)
                    if val_mode:
                        X_val_aug[:, col_idx] = np.expand_dims(
                            previous_predictions_val, 1)
                else:
                    X_aug[:, col_idx] = previous_predictions
                    if val_mode:
                        X_val_aug[:, col_idx] = previous_predictions_val

            # -------------------------------------------------------------------------------------------------------------------------------
            if self.cv is not None and chain_idx < len(self.estimators_) - 1:
                col_idx = X.shape[1] + chain_idx
                cv_result = cross_val_predict(self.base_estimator,
                                              X_aug[:, :col_idx],
                                              y=y,
                                              cv=self.cv)
                if sp.issparse(X_aug):
                    X_aug[:, col_idx] = np.expand_dims(cv_result, 1)
                else:
                    X_aug[:, col_idx] = cv_result

            if val_mode:
                self.classes_.append(temp_estimator.classes_)
                self.estimators_[chain_idx] = temp_estimator
            else:
                self.classes_.append(estimator.classes_)

        return self
Example 21
def test_regression(
    model=SVR(),
    return_model=False,
    param_grid=None,
    gs_score=.4532,
    assertions=True,
    scoring=None,
    verbose=False,
):
    from sklearn.datasets import load_boston

    data = load_boston()

    # Create test and train sets from one dataset
    X_train, X_test, y_train, y_test = train_test_split(
        data["data"],
        data["target"],
        test_size=0.1,
        random_state=0,
    )

    # Create a validation set.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train,
        y_train,
        test_size=0.1,
        random_state=0,
    )

    # List the parameters to search across
    if param_grid is None:
        param_grid = {
            'C': [1, 10, 100, 120, 150],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf'],
        }

    # Grid-search all parameter combinations using a validation set.
    gs = GridSearch(
        model=model,
        param_grid=param_grid,
    )
    gs.fit(
        X_train,
        y_train,
        X_val,
        y_val,
        scoring=scoring,
        verbose=True,
    )

    # Compare with default model without hyperopt
    default = model
    default.fit(X_train, y_train)

    default_score = round(default.score(X_test, y_test), 4)
    gridsearch_score = round(gs.score(X_test, y_test), 4)

    if verbose:
        print('Default score:', default_score, '| GridSearch Score:',
              gridsearch_score)

    if assertions:
        assert (default_score == .0175)
        assert (gridsearch_score is not None)

    if return_model:
        return gs