Esempio n. 1
0
 def get_reg_params(self, X, y):
     """Return the candidate regularization settings as ``{'C': value}`` dicts.

     If ``self.Cs`` is still ``None`` it is computed once and stored on the
     instance: ``l1_min_c(X, y, loss='log')`` multiplied by
     ``np.logspace(0, 7, self.n_C)``.  One dict is returned per entry of
     ``self.Cs``, in the same order.
     """
     if self.Cs is None:
         lower_bound = l1_min_c(X, y, loss='log')
         self.Cs = lower_bound * np.logspace(0, 7, self.n_C)
     return [{'C': value} for value in self.Cs]
Esempio n. 2
0
def compute_coefs():
    """Fit an L1-penalised logistic-regression path, print predictions, plot it.

    Data comes from ``load_labled_point()``, which yields the arrays used here
    as training features/labels and test features/labels.  One model is fitted
    per ``C`` on a 50-point logarithmic grid spanning three decades above
    ``l1_min_c(X, y, loss='log')``.  The predictions of the last fit (largest
    ``C``) are printed next to the true test labels, then the coefficient
    paths are plotted against ``log10(C)``.
    """
    X, y, tX, ty = load_labled_point()
    # Centre both splits on the *training* means: the model is fitted on
    # centred features, so it must be evaluated on features shifted the same way.
    train_mean = np.mean(X, 0)
    X -= train_mean
    tX = tX - train_mean

    cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)

    start = datetime.now()

    # The L1 penalty needs a solver that supports it; liblinear used to be the
    # implicit default and is named explicitly so newer scikit-learn releases
    # (whose default solver rejects penalty='l1') keep working.
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6,
                                          solver='liblinear')
    coefs_ = []
    for c in cs:
        clf.set_params(C=c)
        clf.fit(X, y)
        coefs_.append(clf.coef_.ravel().copy())
    print("This took ", datetime.now() - start)

    # Predictions come from the last model of the path (largest C).
    pred_y = clf.predict(tX)
    print('pred', ' ', 'real')
    for i, predicted in enumerate(pred_y):
        print(predicted, ' ', ty[i][0])

    coefs_ = np.array(coefs_)
    plt.plot(np.log10(cs), coefs_)
    plt.xlabel('log(C)')
    plt.ylabel('Coefficients')
    plt.title('Logistic Regression Path')
    plt.axis('tight')
    plt.show()
Esempio n. 3
0
def lasso_selecting(train_feature, train_label, test_feature, test_label, alpha_base):
    """Yield one evaluation record per point of an L1 logistic-regression path.

    The path uses 10 values of ``C``: ``l1_min_c(train_feature, train_label)``
    multiplied by ``np.logspace(0, 3, 10)``.  A single warm-started liblinear
    model is refitted for each ``C``.

    Args:
        train_feature: training features; must expose ``.columns`` (the column
            names are paired with the fitted coefficients).
        train_label: training labels.
        test_feature: features used to score each fitted model.
        test_label: labels passed to ``evaluate`` with the predicted
            probabilities of class index 1.
        alpha_base: unused; kept for interface compatibility.

    Yields:
        dict with keys ``"model"`` (an independent snapshot of the estimator
        fitted for this ``C``), ``"log(C)"``, ``"features"`` (list of
        ``(column, coefficient)`` for non-zero coefficients), ``"score"``
        (result of ``evaluate``) and ``"coef"`` (flattened coefficient array).
    """
    from copy import deepcopy

    cs = l1_min_c(train_feature, train_label, loss="log") * np.logspace(0, 3, 10)

    print("Computing regularization path ...")
    model_lasso = LogisticRegression(
        penalty="l1",
        solver="liblinear",
        tol=1e-6,
        max_iter=int(1e6),
        warm_start=True,
        intercept_scaling=10000.0,
    )
    for c in tqdm(cs):
        model_lasso.set_params(C=c)
        model_lasso.fit(train_feature, train_label)
        coef = model_lasso.coef_.ravel().copy()
        non_zero_feature = [
            (column, weight)
            for column, weight in zip(train_feature.columns, coef)
            if weight != 0.0
        ]
        valid_pred = model_lasso.predict_proba(test_feature)[:, 1]
        valid_evaluation = evaluate(test_label, valid_pred)
        yield {
            # The estimator is refitted in place on the next iteration, so a
            # snapshot is handed out; otherwise every collected record would
            # end up pointing at the model of the final C.
            "model": deepcopy(model_lasso),
            "log(C)": np.log10(c),
            "features": non_zero_feature,
            "score": valid_evaluation,
            "coef": coef,
        }
Esempio n. 4
0
    def test_l1_min_c(self):
        """``ModelFrame.svm.l1_min_c()`` must match ``svm.l1_min_c`` on the iris data."""
        dataset = datasets.load_iris()
        frame = pdml.ModelFrame(dataset)

        actual = frame.svm.l1_min_c()
        reference = svm.l1_min_c(dataset.data, dataset.target)
        self.assertAlmostEqual(actual, reference)
Esempio n. 5
0
def tune_C_regularization_path(clf, features, labels, ind2label, num_steps=16, num_seeds=50):
    """Walk an L1 regularization path and return the first C with enough positive weights.

    ``clf`` is switched to ``warm_start=True`` and refitted for increasing
    values of ``C``.  The search stops at the first ``C`` for which every
    class has more than ``num_seeds`` strictly positive coefficients.

    Args:
        clf: estimator exposing ``set_params``, ``fit`` and ``coef_``.
        features: training features.
        labels: training labels.
        ind2label: collection of class indices; only its length is used when
            it has exactly two entries, otherwise its items index the rows
            of ``clf.coef_``.
        num_steps: number of ``C`` values to try (at least 1).
        num_seeds: required number of positive coefficients per class.

    Returns:
        The first ``C`` satisfying the criterion, or the largest ``C`` tried
        when none does.

    Raises:
        ValueError: if ``num_steps`` is smaller than 1 (no ``C`` to try).
    """
    if num_steps < 1:
        raise ValueError("num_steps must be at least 1, got %r" % (num_steps,))

    clf.set_params(warm_start=True)
    # Exponents of the multipliers applied to the lower bound: 10^1 ... 10^8.
    min_exponent = 1
    max_exponent = 8
    # l1_min_c is the lowest bound for C such that for C in (l1_min_c, infinity)
    # the model is guaranteed not to be empty; the grid scales that bound.
    cs = l1_min_c(features, labels, loss='log') * np.logspace(min_exponent, max_exponent, num_steps)
    print("Computing regularization path ...")
    start = time()
    c = None
    for c in cs:
        clf.set_params(C=c)
        clf.fit(features, labels)
        # Number of strictly positive coefficients per row of coef_.
        pos = np.sum(clf.coef_ > 0, axis=1)
        if len(ind2label) == 2:
            # binary classification: coef_ has a single row
            enough = bool(pos[0] > num_seeds)
        else:
            # multiclass classification: every class must pass
            enough = all(pos[i] > num_seeds for i in ind2label)
        if enough:
            break
    print("This took %0.3fs" % (time() - start))
    return c
Esempio n. 6
0
    def test_l1_min_c(self):
        """``ModelFrame.svm.l1_min_c()`` must match ``svm.l1_min_c`` on the iris data."""
        dataset = datasets.load_iris()
        frame = pdml.ModelFrame(dataset)

        actual = frame.svm.l1_min_c()
        reference = svm.l1_min_c(dataset.data, dataset.target)
        self.assertAlmostEqual(actual, reference)
def test_L1_iris():
    """Plot the L1-penalised logistic-regression path on two-class iris data.

    Samples with target 2 are dropped, the features are centred, and one
    model is fitted per ``C`` on a 50-point logarithmic grid spanning three
    decades above ``l1_min_c(X, y, loss='log')``.  The coefficients are then
    plotted against ``log10(C)``.
    """
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    # Keep only the samples whose target is not 2 (binary problem).
    keep = y != 2
    X = X[keep]
    y = y[keep]

    X -= np.mean(X, 0)

    cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)

    print("Computing regularization path ...")
    start = datetime.now()
    # liblinear supports the L1 penalty; it is named explicitly because the
    # default solver of newer scikit-learn releases rejects penalty='l1'.
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6,
                                          solver='liblinear')
    coefs_ = []
    for c in cs:
        clf.set_params(C=c)
        clf.fit(X, y)
        coefs_.append(clf.coef_.ravel().copy())
    print("This took ", datetime.now() - start)

    coefs_ = np.array(coefs_)
    plt.plot(np.log10(cs), coefs_)
    plt.xlabel('log(C)')
    plt.ylabel('Coefficients')
    plt.title('Logistic Regression Path')
    plt.axis('tight')
    plt.show()
def test():
    """Plot the L1-penalised logistic-regression path on two-class iris data.

    Samples with target 2 are dropped, the features are centred, and one
    model is fitted per ``C`` on a 50-point logarithmic grid spanning three
    decades above ``l1_min_c(X, Y, loss='log')``.  The coefficients are then
    plotted against ``log10(C)``.
    """
    # Load the data; iris has three classes, drop the samples of class 2.
    iris = datasets.load_iris()
    X = iris.data
    Y = iris.target
    keep = Y != 2
    X = X[keep]
    Y = Y[keep]
    # Subtract the mean so that relative differences stand out.
    X -= np.mean(X, 0)

    # Grid of C values to explore.
    cs = l1_min_c(X, Y, loss='log') * np.logspace(0, 3)

    # liblinear supports the L1 penalty; it is named explicitly because the
    # default solver of newer scikit-learn releases rejects penalty='l1'.
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6,
                                          solver='liblinear')

    # Fit the path: one coefficient vector per C.
    coefs_ = []
    for c in cs:
        clf.set_params(C=c)
        clf.fit(X, Y)
        coefs_.append(clf.coef_.ravel().copy())
    coefs_ = np.array(coefs_)

    # Plot the path.
    plt.plot(np.log10(cs), coefs_)
    plt.xlabel('log(C)')
    plt.ylabel('Coefficients')
    plt.title('Logistic Regression Path')
    plt.axis('tight')
    plt.show()
Esempio n. 9
0
    def test_regularization_path(self):
        """Compare ``LogisticRegressionL1`` with scikit-learn at the end of an L1 path.

        Both models are fitted on the same 10-sample, 5-feature synthetic
        problem.  The scikit-learn intercept and coefficients obtained for
        the largest ``C`` must agree with ``logitfitL1.coef_`` to one decimal.
        """
        num_samples = 10
        num_feat = 5

        X, y = make_classification(n_samples=num_samples, n_features=num_feat, n_informative=3,
                                   n_classes=2, random_state=0, weights=[0.5, 0.5])
        # Matrix passed to LogisticRegressionL1.fit: the features, then a
        # column of ones, then the labels in the last column.
        matrix = np.zeros((num_samples, num_feat + 2))
        matrix[:, :-2] = X
        matrix[:, -2] = np.ones(num_samples)
        matrix[:, -1] = y

        # Betas to test
        logitfitL1 = LogisticRegressionL1()
        lambda_grid = np.exp(-1 * np.linspace(1, 17, 200))
        logitfitL1.fit(matrix, lambda_grid)

        # Reference path computed with scikit-learn
        cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)

        # liblinear supports the L1 penalty; it is named explicitly because
        # the default solver of newer scikit-learn releases rejects it.
        clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
        for c in cs:
            clf.set_params(C=c)
            clf.fit(X, y)

        # Only the last fit (largest C) is compared.
        skbetas = np.append(clf.intercept_[0], clf.coef_)
        np.testing.assert_almost_equal(skbetas, logitfitL1.coef_, 1)
Esempio n. 10
0
    def get_minimum_c(self, fname, asset_class, cols):
        """Return the minimum ``C`` for a non-null L1-penalised logistic model.

        Any value of ``C`` smaller than the returned bound yields a model
        whose coefficients are all zero; the bound is only meaningful with
        an L1 penalty.  The training data is obtained from
        ``self.get_train_data(fname, asset_class, cols)``.
        """
        features, targets = self.get_train_data(fname, asset_class, cols)
        # The last training row is left out of the computation.
        return l1_min_c(features[:-1], targets[:-1], loss='log')
Esempio n. 11
0
    def get_minimum_c(self, cols, fname):
        """Return the minimum ``C`` for a non-null L1-penalised logistic model.

        Any value of ``C`` smaller than the returned bound yields a model
        whose coefficients are all zero; the bound is only meaningful with
        an L1 penalty.  The training data is obtained from
        ``self.obtain_training_data(cols, fname)``.
        """
        features, targets = self.obtain_training_data(cols, fname)
        return l1_min_c(features, targets, loss='log')
Esempio n. 12
0
def l1_select(interpreted_model: bool,
              n_jobs: int,
              dataset: Tuple[pd.DataFrame, pd.Series],
              l1_base_step: int,
              l1_exp_step: float,
              early_stopping_rounds: Any,
              cv_split: Dict[int, Tuple[Sequence[int], Sequence[int]]],
              verbose=True,
              auc_tol: float = 1e-4) -> Tuple[f_list_type, Result]:
    """Select features with a cross-validated L1 logistic-regression path.

    Args:
        interpreted_model: forwarded to ``analyze_result``.
        n_jobs: number of parallel jobs for ``LogisticRegressionCV``.
        dataset: ``(features, target)``; the feature frame's columns name the
            candidate features.
        l1_base_step: number of ``C`` values on the grid.
        l1_exp_step: upper exponent of the grid; the grid is
            ``l1_min_c(...) * np.logspace(0, l1_exp_step, l1_base_step)``.
        early_stopping_rounds: unused; kept for interface compatibility.
        cv_split: fold definition handed to ``PredefinedFolds``.
        verbose: unused; kept for interface compatibility.
        auc_tol: a candidate is acceptable when its score is within this
            tolerance of the best score over all analysed results.

    Returns:
        ``(features_fit, res)``: the names of the columns whose entry in
        ``res.min_weights`` is non-zero, and the selected result.

    Raises:
        ValueError: if ``analyze_result`` returns no result with ``is_neg``
            set, so there is nothing to select from.
    """
    features, target = dataset[0], dataset[1]

    # get grid for cs
    lower_bound = l1_min_c(features, target, loss='log', fit_intercept=True)
    cs = lower_bound * np.logspace(0, l1_exp_step, l1_base_step)
    print('C parameter range in [{0}:{1}], {2} values'.format(
        cs[0], cs[-1], l1_base_step))
    # fit model with crossvalidation
    cv = PredefinedFolds(cv_split)
    clf = LogisticRegressionCV(Cs=cs,
                               solver='saga',
                               tol=1e-5,
                               cv=cv,
                               penalty='l1',
                               scoring=scorer,
                               intercept_scaling=10000.,
                               max_iter=1000,
                               n_jobs=n_jobs,
                               random_state=42)

    clf.fit(features.values, target.values)

    # analyze cv results
    result = analyze_result(clf, features.columns, interpreted_model)

    # Only results flagged ``is_neg`` are eligible for selection.
    candidates = [x for x in result if x.is_neg]
    if not candidates:
        raise ValueError(
            'analyze_result returned no result flagged is_neg; '
            'cannot select features')

    # Best score over *all* results, relaxed by the tolerance.
    ok_score = max(x.score for x in result) - auc_tol
    # Take the first eligible result reaching the threshold; when none does,
    # fall back to the last eligible one.
    res = next((x for x in candidates if x.score >= ok_score), candidates[-1])

    # get selected features
    features_fit = [
        column for column, weight in zip(features.columns, res.min_weights)
        if weight != 0
    ]
    print(res)

    return features_fit, res
Esempio n. 13
0
def refit_reg(x_train: np.ndarray,
              y: np.ndarray,
              l1_grid_size: int,
              l1_exp_scale: float,
              max_penalty: float,
              interp: bool = True) -> Tuple[np.ndarray, float, np.ndarray]:
    """
    Final model refit with regularization

    A warm-started L1 logistic regression (saga solver) is fitted along an
    increasing grid of ``C`` values that always ends at ``max_penalty``.

    Args:
        x_train: training features.
        y: training target.
        l1_grid_size: number of points of the initial ``C`` grid.
        l1_exp_scale: upper exponent of the grid; the grid is
            ``l1_min_c(...) * np.logspace(0, l1_exp_scale, l1_grid_size)``.
        max_penalty: largest ``C`` allowed; larger grid values are dropped
            and ``max_penalty`` itself is appended as the final point.
        interp: when False, return the model fitted with the last ``C``.
            When True, return the model with the largest ``C`` that has no
            strictly positive weight.

    Returns:
        ``(weights, intercept, mask)`` where ``mask`` is a boolean array over
        the features and ``weights`` are the coefficients selected by it.
        With ``interp=False`` the mask marks non-zero weights; with
        ``interp=True`` it marks strictly negative weights.

    Raises:
        ValueError: if ``interp`` is True and every model on the path has at
            least one strictly positive weight.
    """
    clf = LogisticRegression(penalty='l1',
                             solver='saga',
                             warm_start=True,
                             intercept_scaling=100000)
    grid = l1_min_c(x_train, y, loss='log', fit_intercept=True) * np.logspace(
        0, l1_exp_scale, l1_grid_size)
    cs = list(grid[grid <= max_penalty])
    # Always finish the path at max_penalty.  This also covers the case where
    # every grid value exceeds max_penalty and the filtered grid is empty.
    if not cs or cs[-1] < max_penalty:
        cs.append(max_penalty)

    # fit path
    weights, intercepts = [], []
    for c in cs:
        clf.set_params(C=c)
        clf.fit(x_train, y)
        # coef_ is copied so later warm-started fits cannot alter stored rows.
        weights.append(clf.coef_[0].copy())
        intercepts.append(clf.intercept_[0])

    if not interp:
        w, i = weights[-1], intercepts[-1]
        selected = w != 0
        return w[selected], i, selected

    # Walk the path from the largest C down and take the first model without
    # any strictly positive weight.
    for w, i in zip(reversed(weights), reversed(intercepts)):
        if (w > 0).any():
            continue

        neg = w < 0
        return w[neg], i, neg

    raise ValueError('No negative weights grid')
Esempio n. 14
0
def clfProcessor(name, pathForData, scoring_function):
    """
    Loads the data, scales the data, defines a grid for the hyperparameters
    for a l1-regularized logistic regression classifier, performs L1-based
    feature selection, and finds the best hyperparameters via cross validation.
    :param name: Name of data extraction procedure, eg freq_2, abs, etc
    :param pathForData: Path for directory where the data is
    :param scoring_function: Scoring function to optimize, eg f1, precision, etc.
    :return clf: Unfitted classifier configured with the best hyperparameters
    :return X: Scaled, shuffled and feature-selected dataset
    :return y: Labels (shuffled, as floats)
    :return X_max: Array with maximal values of the raw X, to reproduce transform of the training set
    :return X_min: Array with minimal values of the raw X, to reproduce transform of the training set

    The five values are returned together in a list, in the order above.
    """

    # Load training data
    fileMeasTrControl, fileMeasTrCA = getFileNames(name, pathForData)
    X, y = getXandY(fileMeasTrControl, fileMeasTrCA)

    # Scaling to [0,1] intervals and saving transform to apply in test data
    X_max = X.max(axis=0)
    X_min = X.min(axis=0)
    min_max_scaler = preprocessing.MinMaxScaler()
    X = min_max_scaler.fit_transform(X)

    # Shuffle data
    n_samples = X.shape[0]
    order = np.random.permutation(n_samples)
    X = X[order, :]
    # The builtin float is used: the np.float alias it used to stand for was
    # removed from NumPy, and both denote the same dtype.
    y = y[order].astype(float)

    # L1 based feature selection
    # NOTE(review): calling fit_transform on the estimator and the
    # KFold(n, n_folds=...) signature below look like legacy scikit-learn
    # APIs - confirm the pinned version before upgrading.
    theTransform = linear_model.LogisticRegression(C=1, penalty='l1', dual=False, class_weight={1: 2})  # LinearSVC
    X = theTransform.fit_transform(X, y)

    # Find minimum C for non-empty model and get grid for cross validation.
    # Only the upper 20 of the 30 log-spaced multipliers are kept.
    cs_log = np.logspace(0, 4, num=30)
    l1_min = l1_min_c(X, y, loss='log')
    cs = l1_min * cs_log[10:]
    grid = {'C': cs, 'class_weight': [{1: 1}, {1: 2}, {1: 3}, {1: 5}]}

    # Perform grid search cross validation
    kf = KFold(len(y), n_folds=5)
    bestParameters = myCrossValidation(X, y, kf, scoring_function, grid)
    clf = linear_model.LogisticRegression(penalty='l1', **bestParameters)

    return [clf, X, y, X_max, X_min]
Esempio n. 15
0
def refit_reg(X: np.ndarray,
              y: np.ndarray,
              l1_base_step: int,
              l1_exp_step: float,
              max_penalty: float,
              interp: bool = True) -> Tuple[np.ndarray, float, np.ndarray]:
    """Refit the final model along an L1 regularization path.

    A warm-started L1 logistic regression (saga solver) is fitted along an
    increasing grid of ``C`` values that always ends at ``max_penalty``.

    Args:
        X: training features.
        y: training target.
        l1_base_step: number of points of the initial ``C`` grid.
        l1_exp_step: upper exponent of the grid; the grid is
            ``l1_min_c(...) * np.logspace(0, l1_exp_step, l1_base_step)``.
        max_penalty: largest ``C`` allowed; larger grid values are dropped
            and ``max_penalty`` itself is appended as the final point.
        interp: when False, return the model fitted with the last ``C``.
            When True, return the model with the largest ``C`` that has no
            strictly positive weight.

    Returns:
        ``(weights, intercept, mask)`` where ``mask`` is a boolean array over
        the features and ``weights`` are the coefficients selected by it.
        With ``interp=False`` the mask marks non-zero weights; with
        ``interp=True`` it marks strictly negative weights.

    Raises:
        ValueError: if ``interp`` is True and every model on the path has at
            least one strictly positive weight.
    """
    clf = LogisticRegression(penalty='l1',
                             solver='saga',
                             warm_start=True,
                             intercept_scaling=100000)
    grid = l1_min_c(X, y, loss='log', fit_intercept=True) * np.logspace(
        0, l1_exp_step, l1_base_step)
    cs = list(grid[grid <= max_penalty])
    # Always finish the path at max_penalty.  This also covers the case where
    # every grid value exceeds max_penalty and the filtered grid is empty.
    if not cs or cs[-1] < max_penalty:
        cs.append(max_penalty)

    # fit path
    weights, intercepts = [], []
    for c in cs:
        clf.set_params(C=c)
        clf.fit(X, y)
        # coef_ is copied so later warm-started fits cannot alter stored rows.
        weights.append(clf.coef_[0].copy())
        intercepts.append(clf.intercept_[0])

    if not interp:
        w, i = weights[-1], intercepts[-1]
        selected = w != 0
        return w[selected], i, selected

    # Walk the path from the largest C down and take the first model without
    # any strictly positive weight.
    for w, i in zip(reversed(weights), reversed(intercepts)):
        if (w > 0).any():
            continue

        neg = w < 0
        return w[neg], i, neg

    # No fallback on purpose: when no model on the path qualifies, fail
    # loudly instead of returning whatever was fitted last.
    raise ValueError('No negative weights grid')
Esempio n. 16
0
def logistic_regression(model_df, response, folds):
    """Grid-search an L1 logistic regression and return the refitted best model.

    Args:
        model_df: feature frame; its columns name the features and its index
            is used to align ``response``.
        response: target values indexed like ``model_df``; only the entries
            whose index also appears in ``model_df`` are kept.
        folds: cross-validation specification passed to ``GridSearchCV``.

    Returns:
        ``clf.best_estimator_``: the estimator refitted on the whole data
        with the ``C`` that maximises cross-validated average precision.

    The ``C`` grid is 50 linearly spaced values between ``min_c`` and
    ``5000 * min_c``, where ``min_c`` is ``l1_min_c(model_df, response)``.
    Progress and diagnostics are written to the logger named ``'log'``.
    """
    logger = logging.getLogger('log')
    logger.info('dataset shape: {}'.format(model_df.shape))

    response = response[model_df.index.intersection(response.index)]
    min_c = l1_min_c(model_df, response, loss='log')
    # Same grid as log10(logspace(min_c, 5000 * min_c, 50)), but computed
    # directly: the logspace detour overflows to inf once 5000 * min_c
    # exceeds the largest float exponent.
    tuned_parameters = {'C': np.linspace(min_c, min_c * 5000, 50)}

    # liblinear supports the L1 penalty; it is named explicitly because the
    # default solver of newer scikit-learn releases rejects penalty='l1'.
    estimator = LogisticRegression(penalty='l1', solver='liblinear',
                                   random_state=100)
    clf = GridSearchCV(estimator,
                       tuned_parameters,
                       cv=folds,
                       scoring=('neg_log_loss', 'average_precision'),
                       return_train_score=True,
                       refit='average_precision')
    clf.fit(model_df, response)

    logger.info('CV average precision: {}'.format(clf.best_score_))
    logger.info('best param: {}'.format(clf.best_params_))
    # make sure that best index isn't on the edges of the grid
    logger.debug('best param index: {}'.format(clf.best_index_))

    logger.debug('mean train score:\n{}'.format(
        clf.cv_results_['mean_train_average_precision']))
    logger.debug('mean test score:\n{}'.format(
        clf.cv_results_['mean_test_average_precision']))

    best = clf.best_estimator_
    coefs = pd.DataFrame(list(zip(model_df.columns, best.coef_[0])),
                         columns=['app', 'coef'])
    non_zero = coefs['coef'] != 0
    logger.info('train features after regularization: {}'.format(
        non_zero.sum()))
    logger.debug('coefficients:\n{}'.format(
        coefs[non_zero].sort_values(
            'coef', ascending=False).to_string(index=False)))
    logger.debug('intercept: {}'.format(best.intercept_[0]))
    logger.info('train average precision: {}'.format(
        average_precision_score(
            response,
            best.predict_proba(model_df)[:, 1])))

    return best
Esempio n. 17
0
 def fit(self, X, y):
     """Fit an L1 logistic-regression path and pick the first model that is sparse enough.

     ``self.K`` values of ``C`` are tried in decreasing order: ``1e6`` first,
     then ``l1_min_c(X, y)`` times ``np.logspace(3, 0, self.K - 1)``.  The
     coefficients, intercepts and ``1 / C`` of every fit are stored in
     ``coef_path_``, ``intercepts_`` and ``alphas`` in reverse fitting order.
     ``coef_`` / ``intercept_`` are set (only if ``coef_`` is still ``None``)
     from the first fit whose number of features with a coefficient above
     ``1e-4`` in absolute value does not exceed ``self.max_var``; a
     non-positive ``max_var`` accepts the very first fit.  Returns ``self``.
     """
     scaled_grid = l1_min_c(X, y, loss='log') * np.logspace(3, 0, num=self.K - 1)
     cs = np.concatenate([[1e6], scaled_grid])
     clf = LogisticRegression(C=1.0,
                              penalty='l1',
                              tol=1e-6,
                              solver='liblinear',
                              multi_class='ovr')
     self.coef_path_ = []
     self.intercepts_ = []
     self.alphas = []
     for step, c in enumerate(cs):
         # Position of this fit once the stored path has been reversed.
         reversed_index = self.K - 1 - step
         clf.set_params(C=c)
         clf.fit(X, y)
         if self.classes_ is None:
             self.classes_ = clf.classes_
         coef = clf.coef_
         intercept = clf.intercept_
         if self.coef_ is None:
             accept = self.max_var <= 0
             if not accept:
                 # Count the features used by at least one class.
                 active = np.sum(np.abs(coef) > 1e-4, axis=0) > 0
                 accept = np.sum(active) <= self.max_var
             if accept:
                 self.coef_ = coef
                 self.intercept_ = intercept
                 self.current_index = reversed_index
         self.coef_path_.append(coef.copy())
         self.intercepts_.append(intercept)
         self.alphas.append(1.0 / c)
     if self.coef_ is None:  # shouldn't happen but ya never know
         self.coef_ = clf.coef_
         self.intercept_ = clf.intercept_
     self.coef_path_ = self.coef_path_[::-1]
     self.intercepts_ = self.intercepts_[::-1]
     self.alphas = self.alphas[::-1]
     return self
Esempio n. 18
0
def pen_logi_reg(covariates, response, penalty='l1', xlabels=test_keywords):
    """Pick ``C`` by 5-fold cross-validation, refit, and plot the coefficients.

    Args:
        covariates: feature matrix.
        response: target values.
        penalty: penalty passed to ``LogisticRegression`` (default ``'l1'``).
            The ``C`` grid is always derived from ``l1_min_c``.
        xlabels: unused; kept for interface compatibility.

    Returns:
        ``(clf, cvAcc)``: the estimator refitted on all the data with the
        selected ``C``, and the array of mean cross-validation scores, one
        per grid value.

    The grid holds 16 values: ``l1_min_c(covariates, response)`` times
    ``np.logspace(0, 2, 16)``.  When several values tie for the best mean
    score the smallest ``C`` among them is used.
    """
    clf = LogisticRegression(penalty=penalty,
                             solver='saga',
                             tol=1e-6,
                             max_iter=int(1e6),
                             warm_start=True,
                             fit_intercept=True)
    cs = l1_min_c(covariates, response, loss='log') * np.logspace(0, 2, 16)
    mean_scores = []
    for c in cs:
        clf.set_params(C=c)
        scores = cross_val_score(clf, covariates, response, cv=5)
        mean_scores.append(np.mean(scores))
    cvAcc = np.array(mean_scores)

    # Index of the first grid value reaching the best mean score.
    best_score = np.amax(cvAcc)
    best_index = np.where(cvAcc == best_score)[0].item(0)

    clf.set_params(C=cs.item(best_index))
    clf.fit(covariates, response)
    coefs_ = clf.coef_.ravel().copy()
    print('Model coefficients: ')
    print(coefs_)
    plt.plot(np.arange(len(coefs_)), coefs_)
    plt.title('Model coefficients')
    plt.xlabel('Keywords')
    plt.ylabel('Coefficients')
    # Positions of the coefficients that are not numerically zero.
    print(np.where(np.abs(coefs_) >= 1e-4))
    plt.show()
    return clf, cvAcc
Esempio n. 19
0
    def fit(self, L1=True, cs=None):
        """
        Use scikit-learn's LogisticRegression model to fit the data

        For each of the ``self.N`` neurons a regularization path is fitted on
        the first 75% of the stacked data.  The value of C with the highest
        held-out log likelihood on the remaining 25% is kept, and the
        corresponding intercept and coefficients are written to ``self.b``
        and ``self.weights``.

        :param L1:  If True, use L1 penalty on the coefficients (L2 otherwise)
        :param cs:  Optional sequence of inverse penalties C to try.  If None,
                    a grid is built: scaled from ``l1_min_c`` when L1 is True,
                    ``np.logspace(-5,1,10)`` otherwise.
        """
        from sklearn.linear_model import LogisticRegression
        from sklearn.svm import l1_min_c

        # Stack the "F" and "S" arrays of every dataset
        F = np.vstack([d["F"] for d in self.data_list])
        S = np.vstack([d["S"] for d in self.data_list])

        # Hold out some data for cross validation
        offset = int(0.75 * S.shape[0])
        T_xv = S.shape[0] - offset
        F_xv = F[offset:, ...]
        S_xv = S[offset:, ...]
        augmented_xv_data = {"T": T_xv, "S": S_xv, "F": F_xv}

        # Keep the first 75% of the rows for training
        F    = F[:offset, ...]
        S    = S[:offset, ...]

        # Get regularization path for inverse penalty C
        if cs is None:
            if L1:
                # The lower bound is computed from the first column of S only
                cs = l1_min_c(F, S[:,0], loss='log') * np.logspace(1, 6., 10)
            else:
                cs = np.logspace(-5,1,10)
            # cs = sigmas
        # The intercept is also subject to penalization, even though
        # we don't really want to penalize it. To counteract this effect,
        # the intercept can be scaled by a large value.
        # NOTE(review): the value is currently 1, i.e. no extra scaling.
        intercept_scaling = 1

        penalty = "l1" if L1 else "l2"

        for n_post in xrange(self.N):
            print "Computing regularization path for neuron %d ..." % n_post
            ints      = []
            coeffs    = []
            xv_scores = []
            lr = LogisticRegression(C=1.0, penalty=penalty,
                                    fit_intercept=True, intercept_scaling=intercept_scaling,
                                    tol=1e-6)
            for c in cs:
                print "Fitting for C=%.5f" % c
                lr.set_params(C=c)
                lr.fit(F, S[:,n_post])
                ints.append(lr.intercept_.copy())
                coeffs.append(lr.coef_.ravel().copy())
                # xv_scores.append(lr.score(F_xv, S_xv[:,n_post]).copy())

                # Temporarily set the weights and bias so that
                # heldout_log_likelihood scores this candidate
                self.b[n_post] = lr.intercept_
                self.weights[n_post, :] = lr.coef_
                xv_scores.append(self.heldout_log_likelihood(augmented_data=augmented_xv_data))

            # Choose the regularization penalty with cross validation
            print "XV Scores: "
            for c,score  in zip(cs, xv_scores):
                print "\tc: %.5f\tscore: %.1f" % (c,score)
            best = np.argmax(xv_scores)
            print "Best c: ", cs[best]

            # Save the best weights
            self.b[n_post]          = ints[best]
            self.weights[n_post, :] = coeffs[best]

            print " Max w: ", self.weights[n_post].max(), \
                  " Min w: ", self.weights[n_post].min()

            # Fail if the selected model is numerically empty
            assert abs(self.weights[n_post]).max() > 1e-6

        print ""
Esempio n. 20
0
import load_data_ext as ld_ext
import load_data_mi_ext_new as ldmi_ext

# Fix the NumPy random seed
np.random.seed(10)

############################################################################

# Quick logistic regression with lasso penalty, chosen with cross validation
# (the website much of this code was adapted from is not recorded here)

# Initial model with an arbitrary value of C
mod1 = LogisticRegression(C=0.5, penalty='l1')

# Smallest value of C before all coefficients are set to zero.
# The first selection (0:229) is passed as features and column 229 as target.
min_l1_C = l1_min_c(ld_ext.train1.ix[:, 0:229], ld_ext.train1.ix[:, 229])
'%f' % min_l1_C  # 0.000028 ~= 0.00003

# Create candidate values of C: 15 values spanning four decades above the minimum
c_vals = min_l1_C * np.logspace(0, 4, 15)

# Create a dictionary whose keys are the candidate values of C.
# The dictionary will hold the error rates in each CV trial for that
# value of C.
cdict = {}
for c in c_vals:
    cdict[c] = []

# Cross validation to choose c. train1 and test1 already have randomized rows from train_test_split

# Generate indices to split data into 50 chunks
Esempio n. 21
0
    def fit(self,xw,xwl2,y,gs=4,model_type='logit',verbose=True):
        """Fit the two-stage model and store the estimators on the instance.

        Stage 1 grid-searches C for a classifier of ``y`` from ``xw`` (an
        L2 logistic regression when ``model_type`` is 'logit', a linear SVC
        otherwise) and stores the best estimator in ``self.clf1``.
        ``self.suffle_hm`` then derives the labels ``hm_y`` used by stage 2.
        Stage 2 grid-searches C for an L1 logistic regression of ``hm_y``
        from ``xwl2`` and stores the best estimator in ``self.clf2``.

        Both searches use stratified folds with ``gs`` folds and accuracy as
        the score.  ``verbose`` is stored in ``self.verbose`` and controls
        printing of the selected estimators and their coefficients.
        """
        self.verbose = verbose
        if model_type=='logit':
            clf = LogisticRegression(C=1,class_weight='balanced',penalty='l2',max_iter=300)
        else:
            #clf = SVC(kernel='linear', class_weight='balanced', C=.1,probability=False)
            clf = SVC(C=1.,cache_size=500,kernel='linear',class_weight='balanced',probability=False)
        '''
        # wrapper feature selection
        rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y, 3), scoring='f1')#accuracy
        rfecv.fit(xw, y)
        print("Optimal number of features : %d" % rfecv.n_features_)
        print("ids: {}".format((rfecv.ranking_<=5).sum()))
        print rfecv.grid_scores_
        self.rfecv = rfecv
        if rfecv.support_.sum()>10:
            self.w_select = rfecv.support_
        else:
            self.w_select = rfecv.ranking_<=10
        '''
        #xw = [:,self.w_select]

        #self.mask_selection = (np.ones((1,xw.shape[1]))==1)[0,:]
        ## Optimize the hyper parameters
        # Stage 1
        #param_grid = dict(C=(np.array([5,3,1])))
        if model_type=='logit':
            param_grid = dict(C=(10**np.arange(1.,-2.,-0.5)))
            #param_grid = dict(C=(np.logspace(-.2, 1., 15)))
            #param_grid = dict(C=(np.arange(3,1,-0.5)))
        else:
            # NOTE: the first grid is immediately replaced by the second one,
            # so only C in (1., 1.00001) is searched for the SVC
            param_grid = dict(C=(np.arange(3.5,0.,-0.5)))
            param_grid = dict(C=(1.,1.00001))
            #param_grid = dict(C=(np.logspace(-1.5, 0, 10)))
            #param_grid = dict(C=(np.arange(2.,0.5,-0.05)))
            #param_grid = dict(C=(np.array([0.01, 0.1, 1, 10, 100, 1000])))

        gridclf = GridSearchCV(clf, param_grid=param_grid, cv=StratifiedKFold(y,n_folds=gs), n_jobs=-1,scoring='accuracy')
        gridclf.fit(xw,y)
        self.clf1 = gridclf.best_estimator_
        if self.verbose:
            print self.clf1
            print self.clf1.coef_
        #hm_y,y_pred_train = self.estimate_hitmiss(xw,y)
        # Labels for stage 2; proba is only referenced by commented-out code below
        hm_y,proba = self.suffle_hm(xw,y,gamma=.9,n_iter=100)

        print 'Stage 2'
        #Stage 2
        # min_c is only printed; the grid actually searched is fixed below
        min_c = l1_min_c(xwl2,hm_y,loss='log')
        print 'minimum c: ',min_c
        #clf2 = LogisticRegression(C=10**0.1,class_weight=None,penalty='l2',solver='sag')
        #clf2 = LogisticRegression(C=1,class_weight=None,penalty='l2',solver='sag',max_iter=300)
        #clf2 = LinearSVC(class_weight='balanced',penalty='l1',dual=False)
        clf2 = LogisticRegression(C=1.,class_weight='balanced',penalty='l1',solver='liblinear',max_iter=300)
        #clf2 = LogisticRegression(C=1.,class_weight='balanced',penalty='l2',solver='sag',max_iter=300)
        #clf2 = SVC(C=1.,cache_size=500,kernel='linear',class_weight='balanced')
        #param_grid = dict(C=(10**np.arange(1.,-2.,-0.5)))
        #param_grid = dict(C=(np.arange(3,1,-0.5)))
        #param_grid = dict(C=(np.logspace(-0.5, 2., 30)))
        #param_grid = dict(C=(np.logspace(1., 1.6, 30)))
        param_grid = dict(C=(np.logspace(-.2, 1., 15)))
        #param_grid = dict(C=(np.logspace(np.log10(min_c), 0., 15)))
        #param_grid = dict(C=(1,1.0001)) 
        # 2 levels balancing
        '''
        new_classes = np.zeros_like(y)
        new_classes[(y==0) & (hm_y==0)]=0
        new_classes[(y==1) & (hm_y==0)]=1
        new_classes[(y==0) & (hm_y==1)]=2
        new_classes[(y==1) & (hm_y==1)]=3

        tmp_samp_w = len(new_classes) / (len(np.unique(new_classes))*1. * np.bincount(new_classes))
        tmp_samp_w = (1.*(tmp_samp_w/tmp_samp_w.sum()))
        sample_w = new_classes.copy().astype(float)
        sample_w[new_classes==0] = tmp_samp_w[0]
        sample_w[new_classes==1] = tmp_samp_w[1]
        sample_w[new_classes==2] = tmp_samp_w[2]
        sample_w[new_classes==3] = tmp_samp_w[3]
        '''
        #gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedKFold(hm_y,n_folds=gs),fit_params=dict(sample_weight=sample_w), n_jobs=-1,scoring='accuracy')
        #gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedKFold(hm_y,n_folds=gs),fit_params=dict(sample_weight=proba), n_jobs=-1,scoring='accuracy')
        gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedKFold(hm_y,n_folds=gs), n_jobs=-1,scoring='accuracy')
        gridclf.fit(xwl2,hm_y)
        clf2 = gridclf.best_estimator_
        #clf2.fit(xw[train_index,:][:,idx_sz],hm_y)
        if self.verbose:
            print clf2
            print clf2.coef_

        self.clf2 = clf2
Esempio n. 22
0
def train_and_val(features_train, labels_train, features_val, labels_val, ml_engine, feature_group=None):
    '''
    Train the requested model family and pick its hyper-parameters on the
    validation set (highest number of correct validation predictions).

    Parameters
    ----------
    features_train : numpy 2d array. training feature matrix
    labels_train : numpy 1d array. training labels. 0 for negative and 1 for positive
    features_val : numpy 2d array. validation feature matrix
    labels_val : numpy 1d array. validation labels. 0 for negative and 1 for positive
    ml_engine: string. machine learning engine to use. Possible choice : 1) 'lrlasso': logistic lasso
                                                                          2) 'lglasso' logistic group lasso
                                                                          3) 'lgen': logistic elastic net
                                                                          4) 'grrf' : guided regularized random forest
                                                                          5) 'lsvm': linear support vector machine
                                                                          6) 'lgpcc': logistic regression + pearson correlation coefficient
    feature_group : group assignment of the features; only used by 'lglasso'

    Returns
    -------
    best_model : the best model selected from validation
    best_param : the best hyper-parameters from validation
    feature_num : number of selected features from best model

    Raises
    ------
    ValueError : if ``ml_engine`` is not one of the names listed above, or if
                 'lgpcc' has no candidate feature subset to evaluate

    Notes
    -----
    For 'lrlasso' and 'lglasso' the 0 labels of ``labels_train`` and
    ``labels_val`` are replaced by -1 in place, i.e. in the arrays the
    caller passed in.
    '''

    # Train model and get predicted score
    if ml_engine == "lrlasso" or ml_engine == "lglasso":

        # Convert to R objects
        features_train = BASE.as_matrix(features_train)
        features_val = BASE.as_matrix(features_val)

        # gglasso requires negative class to have label '-1'. Replace 0 with -1
        # For convenience of comparison, labels_val will not be converted to R object
        labels_train[labels_train == 0] = -1
        labels_val[labels_val == 0] = -1
        labels_train = BASE.as_vector(labels_train)
        # Column vector so it broadcasts against one prediction column per lambda
        labels_val = labels_val.reshape(labels_val.shape[0], 1)

        # To avoid name conflict with python keyword 'lambda', use dictionary to pass function argument
        if ml_engine == "lrlasso":
            args = {'x': features_train, 'y': labels_train, 'loss': 'logit', 'lambda.factor': 0.01}
        else:
            args = {'x': features_train, 'y': labels_train, 'group': BASE.as_vector(feature_group), 'loss': 'logit', 'lambda.factor': 0.01}

        # Train model on training data set
        best_model = GGL.gglasso(**args)

        # Predict on validation data set
        pred = GGL.predict_gglasso(best_model, type='class', newx=features_val)
        pred = np.array(pred)

        # Get sequence of lambdas
        lambda_seq = np.array(best_model[best_model.names.index('lambda')])

        # Get the lambda which gives highest accuracy on validation dataset
        best_idx = np.argmax(np.sum(pred == labels_val, axis=0))
        best_param = lambda_seq[best_idx]

        # Get number of selected features from the best model
        coef = np.array(best_model[best_model.names.index("beta")])[:, best_idx]
        feature_num = coef[coef != 0].shape[0]

    elif ml_engine == "lgen":

        # Generate sequence of two parameters: alpha and l1_ratio
        alpha_list = np.logspace(-3, 3, 5)
        l1_ratio_list = np.linspace(0, 1, 5)

        # Start below any possible count so the first candidate is always
        # recorded, even when no validation sample is predicted correctly.
        best_acc = -1
        best_param = None
        best_model = None

        # Test the different alpha and l1_ratio on validation dataset
        for alpha in alpha_list:
            for l1_ratio in l1_ratio_list:
                lgen = SGDClassifier(loss='log', penalty='elasticnet', alpha=alpha, l1_ratio=l1_ratio).fit(features_train, labels_train)
                pred = lgen.predict(features_val)
                acc = np.sum(labels_val == pred)
                if acc > best_acc:
                    best_acc = acc
                    best_param = (alpha, l1_ratio)
                    best_model = lgen

        # Get number of selected features from the best model
        feature_num = best_model.coef_[best_model.coef_ != 0].shape[0]

    elif ml_engine == "grrf":

        # Convert to R objects
        features_train = BASE.as_matrix(features_train)
        features_val = BASE.as_matrix(features_val)
        labels_train = BASE.as_vector(labels_train)

        rf = RRF.RRF(features_train, BASE.as_factor(labels_train), flagReg=0, ntree=100)  # build an ordinary RF

        # Get importance score, normalised by its maximum
        imp = rf[rf.names.index("importance")]
        imp = np.array(imp)
        imp = imp / (np.max(imp))
        best_acc = -1
        best_param = None
        best_model = None

        # Test different gamma on validation dataset
        for gamma in (0, 0.5, 1):
            coefReg = (1 - gamma) + gamma * imp
            coefReg = FloatVector(coefReg)
            grrf = RRF.RRF(features_train, BASE.as_factor(labels_train), flagReg=1, coefReg=coefReg, ntree=100)
            # Shift the predicted values by one before comparing with labels_val
            pred = np.array(RRF.predict_RRF(grrf, features_val)) - 1
            acc = np.sum(labels_val == pred)
            if acc > best_acc:
                best_acc = acc
                best_param = gamma
                best_model = grrf

        # Get number of selected features from the best model
        feature_num = np.array(best_model[best_model.names.index("feaSet")]).shape[0]

    elif ml_engine == "lsvm":
        # Calculate the lower bound of C for a null model.
        # If C goes smaller than this value, the model would
        # end up selecting no features
        min_c = l1_min_c(features_train, labels_train)

        # log spaced list of C parameters
        c_list = np.logspace(np.log10(min_c), 3, 10)
        best_acc = -1
        best_param = None
        best_model = None

        # Train on training dataset, validate each C on validation dataset
        for C in c_list:
            candidate = LinearSVC(C=C, penalty='l1', dual=False).fit(features_train, labels_train)
            pred = candidate.predict(features_val)
            acc = np.sum(labels_val == pred)
            if acc > best_acc:
                best_acc = acc
                best_param = C
                best_model = candidate

        # Get number of selected features from the best model
        feature_num = best_model.coef_[best_model.coef_ != 0].shape[0]

    elif ml_engine == "lgpcc":

        # Pearson correlation of every feature column with the labels
        y = labels_train.astype(np.float32)
        cor_coef = []
        for i in range(features_train.shape[1]):
            x = features_train[:, i].astype(np.float32)
            cor_coef.append(pearsonr(x, y)[0])

        cor_coef = np.array(cor_coef)
        # Feature indices by decreasing absolute correlation.  The full
        # reversal keeps every feature; a [-1:0:-1] slice would silently
        # drop the least correlated one.
        order = np.argsort(np.abs(cor_coef))[::-1]
        # Candidate subset sizes: 50, 100, ... (strictly below the feature count)
        to_include = np.arange(50, order.shape[0], 50)
        best_acc = -1
        best_param = None
        best_model = None

        for i in to_include:
            lg = LogisticRegression().fit(features_train[:, order[:i]], labels_train)
            pred = lg.predict(features_val[:, order[:i]])
            acc = np.sum(pred == labels_val)
            if acc > best_acc:
                best_acc = acc
                best_param = order[:i]
                best_model = lg

        if best_model is None:
            raise ValueError(
                "'lgpcc' needs more than 50 features to build a candidate subset, got %d"
                % order.shape[0])

        # Get number of selected features from the best model
        feature_num = best_model.coef_[best_model.coef_ != 0].shape[0]

    else:
        raise ValueError("unknown ml_engine: %r" % (ml_engine,))

    return best_model, best_param, feature_num
Esempio n. 23
0
# Convert the sequence of per-item feature sequences in X into a NumPy array
X = np.array([np.array(xi) for xi in X])
# Add subject number as feature
#X = np.hstack([X, trials['subject'].reshape(-1,1)])
# Binary target: 1 where the condition is 'win_event', 0 otherwise
y = trials['condition'] == 'win_event'
y = y.astype(int)
# Hold out 20% of the data (no random_state is set, so the split varies between runs)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Create model: warm-started L1 logistic regression refitted for each C below
model = LogisticRegression(random_state=0,
                           penalty='l1',
                           solver='liblinear',
                           tol=1e-6,
                           max_iter=int(1e6),
                           warm_start=True,
                           intercept_scaling=10000.)
# Regularization parameter: 16 values spanning three decades above the minimum C
# NOTE(review): the minimum C is computed on the full training split while the
# models below are fitted on a subset of it - confirm this is intended.
cs = l1_min_c(X_train, y_train, loss='log') * np.logspace(0, 3, 16)
coefs_ = []
accs = []
# Iterate over smaller subset for parameter search to reduce computational time
X_train_subset = X_train[:5000]
Y_train_subset = y_train[:5000]
for c in cs:
    model.set_params(C=c)
    model.fit(X_train_subset, Y_train_subset)
    coefs_.append(model.coef_.ravel().copy())
    # Accuracy on the held-out split for this C
    accs.append(model.score(X_test, y_test))

# Plot coefs against log10(C)
coefs_ = np.array(coefs_)
plt.plot(np.log10(cs), coefs_, marker='o')
ymin, ymax = plt.ylim()
Esempio n. 24
0
        f1_tests = list()
        acc_trains = list()
        acc_tests = list()
        for i in range(10):
            # Sample and split data
            sampls = get_group_samples(scenario,
                                       feat_keys,
                                       n_labl,
                                       sample_size,
                                       undersampling=True)
            feat_train = np.vstack([s[0] for s in sampls[1:]])
            labl_train = np.hstack([s[1] for s in sampls[1:]])
            # feat_train = np.vstack([s[0] for s in [sampls[0]] + sampls[2:]])
            # labl_train = np.hstack([s[1] for s in [sampls[0]] + sampls[2:]])

            c = (l1_min_c(feat_train, labl_train, loss='log') *
                 np.logspace(0, 4, 5)).tolist()[1]
            # if sample_size > 100:  # Fix for PH-Breuer
            #     c = 0.02761796
            # Learn identifier
            idf = logReg(
                random_state=False,
                fit_intercept=False,
                class_weight='none',
                max_iter=max_iter,
                penalty=penalty,
                solver=solver,
                C=c,
                l1_ratio=l1_ratio,  # 0.0=l2, 1.0=l1
                verbose=False,
                n_jobs=-1,
def get_C_grid(X, y):
	"""Return 100 log-spaced C values for L1-penalised logistic regression.

	The grid starts at ``l1_min_c(X, y, loss='log')`` and ends at 1000 times
	that value.
	"""
	lowest_c = l1_min_c(X, y, loss='log')
	multipliers = np.logspace(0, 3, 100)
	return lowest_c * multipliers
Esempio n. 26
0
from sklearn import datasets
from sklearn.svm import l1_min_c

iris = datasets.load_iris()

# Keep only the samples whose target is not 2 (leaves a two-class problem).
keep = iris.target != 2
X = iris.data[keep]
y = iris.target[keep]

# Normalize X to speed-up convergence (divide by the global maximum, in place).
X /= X.max()

# #############################################################################
# Demo path functions

# 16 C values, log-spaced from l1_min_c(...) up to 1e7 times that value.
smallest_c = l1_min_c(X, y, loss="log")
cs = smallest_c * np.logspace(0, 7, 16)

print("Computing regularization path ...")
start = time()
path_options = dict(
    penalty="l1",
    solver="liblinear",
    tol=1e-6,
    max_iter=int(1e6),
    warm_start=True,
    intercept_scaling=10000.0,
)
clf = linear_model.LogisticRegression(**path_options)
coefs_ = []
for c in cs:
    # Refit the same estimator object at every grid point.
    clf.set_params(C=c).fit(X, y)
Esempio n. 27
0
# Toggle: True recomputes the tsfresh feature table, False loads the cached pickles.
regenerate_tsfresh = True
if not regenerate_tsfresh:
    print('Reading tsfresh data...')
    all_labels = pd.read_pickle('pkl/drum_tsfresh_labels.pkl')
    audio_tsfresh = pd.read_pickle('pkl/drum_tsfresh.pkl')
else:
    print('Generating tsfresh data...')
    settings = EfficientFCParameters()
    audio_tsfresh = extract_relevant_features(
        all_audio,
        all_labels,
        column_id='file_id',
        column_sort='time_id',
        default_fc_parameters=settings,
    )

print('Running logistic regression CV...')
print('Started CV %s' % datetime.now())
# Candidate C values: 16 points, log-spaced from l1_min_c(...) to 1e7 times it.
lowest_c = l1_min_c(audio_tsfresh, all_labels, loss='log')
cs = lowest_c * np.logspace(0, 7, 16)
cv_estimator = LogisticRegressionCV(
    Cs=cs,
    penalty='l1',
    multi_class='ovr',
    solver='saga',
    tol=1e-6,
    max_iter=int(1e6),
    n_jobs=-1,
)
cv_result = cv_estimator.fit(audio_tsfresh, all_labels)
print('Done CV %s' % datetime.now())

print('Dumping results...')
# Make sure the output directory exists before writing into it.
Path("pkl").mkdir(exist_ok=True)
all_labels.to_pickle('pkl/drum_tsfresh_labels.pkl')
audio_tsfresh.to_pickle('pkl/drum_tsfresh.pkl')
dump(cv_result, 'pkl/drum_logreg_cv.joblib')
Esempio n. 28
0
gc.collect()

# ---------------------------------------------------------------------
# Grid Search

# Scaling: both scalers are fitted on train and test rows stacked together.
# NOTE(review): fitting on the test rows lets test statistics leak into the
# training features -- confirm this is intended.
all_x = pd.concat([train_x, test_x], axis=0)
ss = StandardScaler().fit(all_x)
mm = MinMaxScaler().fit(all_x)
train_x_s = ss.transform(train_x)
test_x_s = ss.transform(test_x)
train_x_m = mm.transform(train_x)
test_x_m = mm.transform(test_x)

# Lower limit of C for L1 regression.
# NOTE(review): X and y are not defined in the visible snippet, and cs is not
# used by the grid below -- verify against the rest of the script.
cs = l1_min_c(X, y, loss='log')
param_grid = dict(penalty=['l1'], C=[0.1, 0.2])
logit = LogisticRegression(
    solver='saga',  # saga supports the L1 penalty (sag is L2-only); both need scaled features
    random_state=SEED,
    n_jobs=1,
)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
grid_cv_logit = GridSearchCV(
    logit,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=CPU,
    cv=folds,
    verbose=1,
)

# Fit on the standard-scaled features; the min-max variant is kept for reference.
grid_cv_logit.fit(train_x_s, train_y.values.reshape(-1, ))
# grid_cv_logit.fit(train_x_m, train_y.values.reshape(-1, ))
from sklearn import datasets
from sklearn.svm import l1_min_c

# Load iris and keep only the samples whose target is not 2 (two-class problem).
iris = datasets.load_iris()
X = iris.data
y = iris.target

X = X[y != 2]
y = y[y != 2]

# Centre every feature column on zero (in place).
X -= np.mean(X, 0)

###############################################################################
# Demo path function

# 50 C values (np.logspace default), log-spaced from l1_min_c(...) to 1000x it.
cs = l1_min_c(X, y, loss="log") * np.logspace(0, 3)

print("Computing regularization path ...")
start = datetime.now()
# solver="liblinear" is given explicitly: the default solver in current
# scikit-learn releases (lbfgs) does not support the L1 penalty and raises.
# liblinear was the default in the releases this script was written for.
clf = linear_model.LogisticRegression(
    C=1.0, penalty="l1", solver="liblinear", tol=1e-6
)
coefs_ = []
for c in cs:
    clf.set_params(C=c)
    clf.fit(X, y)
    # copy() so the stored vector is independent of later refits
    coefs_.append(clf.coef_.ravel().copy())
print("This took %s" % (datetime.now() - start))

# One curve per feature; x axis is log10(C).
coefs_ = np.array(coefs_)
plt.plot(np.log10(cs), coefs_)
ymin, ymax = plt.ylim()
plt.xlabel("log(C)")
Esempio n. 30
0
	def trainModel(self, do_pca = False,out_dir='./cache', rftop = 40, class_labels = SP.array(['G1','S','G2M']), cv=10, npc=3, is_SVM=1, is_RFE=0 , scale=False):
		"""Cross-validate several classifiers on self.Y / self.labels and store their scores.

		For every CV fold, plus one final pass trained on all samples, the method
		fits (depending on the flags) a Gaussian naive Bayes model on PCA
		features, SVM variants, an L1 logistic regression chosen by grid search,
		an L2 logistic regression and an ExtraTrees forest. Per-fold class
		probabilities are written into ``pred*`` arrays; the final-pass
		predictions are kept in ``pred*_ts``. At the end the scores are stored on
		``self`` (``scores``, ``scoresLR``, ``scoresLRall``, ``scoresGNB``, and
		the SVM ones when ``is_SVM == 1``, each with a ``_tst`` companion),
		``self.ranking`` is set and classification reports are printed.

		Parameters:
			do_pca: >=1 fits PCA + GaussianNB; ==2 also appends the PCs to the
				features; ==3 selects a PCA-aware parameter grid in the SVM branch.
			out_dir: directory for plots and the report file; created if missing.
			rftop: maximum number of features shown in the importance plot.
			class_labels: not referenced in this method.
			cv: number of folds, or the string 'LOOCV'.
			npc: number of principal components.
			is_SVM: 1 enables univariate selection + RBF SVM (only if is_RFE != 1).
			is_RFE: 1 enables recursive feature elimination with a linear SVM.
			scale: not referenced in this method.

		Returns nothing; results are stored on ``self`` and written to ``out_dir``.
		"""
		if not os.path.exists(out_dir):
			os.makedirs(out_dir)
		CFG = {}
		CFG['is_RFE'] = is_RFE # use recursive feature selection (can be slow for large datasets)
		CFG['is_SVM'] = is_SVM # use SVM with univariate feature selection (faster than RFE)
		CFG['CV_inner'] = cv #inner CV for RFE_CV: either an int or 'LOOCV'
		CFG['out_dir'] = out_dir
		CFG['do_pca'] = do_pca
		CFG['lassotop'] = 20
		self.cv = cv
		Y = self.Y
		labels = self.labels
		var_names = self.geneNames
		numClasses = self.numClasses
		# One row per sample, one column per class, filled fold by fold below.
		predRF = SP.zeros((len(labels),numClasses))
		predSVM = SP.zeros((len(labels),numClasses))
		predSVMrbf = SP.zeros((len(labels),numClasses))
		predGNB = SP.zeros((len(labels),numClasses))
		predLR = SP.zeros((len(labels),numClasses))
		predLRall = SP.zeros((len(labels),numClasses))
		names_dict={}
		# NOTE(review): LeaveOneOut(n) and StratifiedKFold(labels, n_folds=...) look
		# like the legacy sklearn.cross_validation signatures (iterable splitters) --
		# confirm the installed scikit-learn version still provides them.
		if self.cv == 'LOOCV':
			loo = LeaveOneOut(len(labels))
			CV_list = (list(iter(loo)))
			CV_list.append((SP.array(range(Y.shape[0])), SP.array(range(Y.shape[0]))))#all data...
		else:
			skf = StratifiedKFold(labels, n_folds=self.cv)
			CV_list = (list(iter(skf)))
			CV_list.append((SP.array(range(Y.shape[0])), SP.array(range(Y.shape[0]))))#all data...
		# The last entry of CV_list trains and tests on every sample ("final model").
		lambda_best = SP.zeros((1,len(CV_list))).ravel()
		print("Performing cross validation ...")
		for i in range(len(CV_list)):
			if i<len(CV_list)-1:
				print("Fold " + str(i+1) + " of " + str(len(CV_list)-1))
			else:
				print("Final model")
			# string label for this fold
			#get data of a CV run
			cv_tr = CV_list[i][0]
			cv_tst = CV_list[i][1]
			lab_tr = labels[cv_tr]
			Ytr = Y[cv_tr,:]
			Ytst = Y[cv_tst,:]
			lab_tst = labels[cv_tst]
			if (i==len(CV_list)-1):
				foldlabel = 'full'
				# NOTE(review): `== None` on an ndarray may compare element-wise and make
				# this `if` ambiguous; `is None` is probably what is meant -- confirm.
				if (self.Y_tst==None):
					Ytst = Y[cv_tst,:]
					lab_tst = labels[cv_tst]
				else:
					foldlabel = 'Test'
					Ytst = self.Y_tst
					lab_tst = self.labels_tst
			else:
				foldlabel = str(i)
			if do_pca>=1:
				npc = npc#3
				#do PCA to get features
				pcaCC = PCA(n_components=npc, whiten=False)
				pcaCC.fit(Ytr)
				pcaTst=pcaCC.transform(Ytst)
				pcaTr=pcaCC.transform(Ytr)
				#selection = SelectKBest(k=1)
				#combined_features = FeatureUnion([("pca", pcaCC), ("univ_select", selection)])
				combined_features = FeatureUnion([("pca", pcaCC)])
				gnb = GaussianNB()
				y_pred = gnb.fit(pcaTr, lab_tr).predict_proba(pcaTst)
				if i<len(CV_list)-1:
					predGNB[cv_tst,:] =y_pred#[:,1]
				else:
					predGNB_ts = y_pred#[:,1]
			if do_pca==2:
				# Append the principal components to the feature matrices and names.
				Ytr = SP.concatenate((Ytr, pcaTr),1)
				Ytst = SP.concatenate((Ytst, pcaTst),1)
				pcnames = []
				for pci in range(npc):
					pcnames.append('PC'+str(pci+1))
				# NOTE(review): axis 1 is used here; if var_names is one-dimensional
				# this concatenate would fail -- verify the shape of self.geneNames.
				var_names = SP.concatenate((var_names, SP.array(pcnames)),1)
			# NOTE(review): this message is printed before the SVM branches; the
			# forest itself is fitted further down in the loop body.
			print("  Computing random forest ...")

			if CFG['is_RFE']==1:#Recursive feature selection with SVM
				print("  Computing RFE with SVM ...")
				svc = SVC(kernel="linear", probability=False, class_weight='auto')#use linear SVM for selection
				rfecv = RFECV(estimator=svc, step=1,scoring='f1')
				param_grid = dict(estimator__C=[0.1, 1, 10, 100, 1000])
				clf_rfe = GridSearchCV(rfecv, param_grid=param_grid, cv=3, scoring='f1')#GridSearch to find optimal parameters
				clf_rfe.fit(Ytr, lab_tr)
				svc = SVC(kernel="linear", probability=False,C=clf_rfe.best_estimator_.estimator.C, class_weight='auto')#use linear SVM for selection
				# NOTE(review): the rfecv built in the branches below is not used
				# afterwards; clf_rfe.best_estimator_ is what gets refitted.
				if CFG['CV_inner']=='':
					rfecv = RFECV(estimator=svc, step=1,scoring='f1')
				elif CFG['CV_inner']=='LOOCV':
					rfecv = RFECV(estimator=svc, step=1,scoring='f1', cv=LeaveOneOut(len(lab_tr)))
				else:
					rfecv = RFECV(estimator=svc, step=1,scoring='f1', cv=StratifiedKFold(lab_tr, n_folds=CFG['CV_inner']))
				clf_rfe.best_estimator_.fit(Ytr, lab_tr)
				predicted = clf_rfe.best_estimator_.predict(Ytst)
				if i<len(CV_list)-1:
					predSVM[cv_tst,:] = predicted
				else:
					# NOTE(review): predSVM_ts is not assigned anywhere before this
					# indexed write in this method -- looks like a NameError; confirm.
					predSVM_ts[cv_tst] = predicted
				classifier = svm.SVC(kernel='rbf', gamma=0.05, class_weight='auto', probability=True)#rbf kernel for prediction
				param_grid = dict(C=[0.1, 1], gamma=[1e-1,1e-2,1e-3])
				clf_rbf = GridSearchCV(classifier, param_grid=param_grid, cv=3, scoring='f1')
				# The RBF SVM only sees the features RFE ranked 1 (i.e. selected).
				clf_rbf.fit(Ytr[:,clf_rfe.best_estimator_.ranking_==1], lab_tr)
				clf_rbf.best_estimator_.fit(Ytr[:,clf_rfe.best_estimator_.ranking_==1], lab_tr)
				predicted = clf_rbf.best_estimator_.predict_proba(Ytst[:,clf_rfe.best_estimator_.ranking_==1])
				if i<len(CV_list)-1:
					predSVMrbf[cv_tst,:] = predicted
				fpr, tpr, thresholds = metrics.roc_curve(lab_tst, predicted[:,1])
				# NOTE(review): CFG["CV_plots"] is never set in this method (KeyError?),
				# and `|` binds tighter than `>`, so this parses as
				# ((i==...) | CFG["CV_plots"]) > 0 -- confirm the intended condition.
				if (i==len(CV_list)-1) | CFG["CV_plots"]>0:
					PL.figure()
					PL.plot(fpr, tpr)
					PL.savefig(CFG['out_dir']+'/RF_SVM_'+foldlabel+'.pdf')
					names_dict[foldlabel+'_SVM']=self.geneNames[clf_rfe.best_estimator_.ranking_==1]
			elif CFG['is_SVM']==1:#univariate FS with rbf SVM; choose this if you have a large data set (many features, eg RNAseq)
				print("  SVM feature selection ...")
				classifier = svm.SVC(kernel='rbf', gamma=0.05, class_weight='auto', probability=True)
				selection = SelectKBest(k=1)
				combined_features = FeatureUnion([("univ_select", selection)])

				X_features = combined_features.fit(Ytr, lab_tr).transform(Ytr)
				# Standardise with statistics taken from the training fold only.
				scaler = preprocessing.StandardScaler().fit(Ytr)
				YtrS = scaler.transform(Ytr)
				YtstS = scaler.transform(Ytst)

				classifier.fit(X_features, lab_tr)
				pipeline = Pipeline([("features", combined_features), ("svm", classifier)])
				# NOTE(review): the do_pca==3 grid addresses a "pca" step, but the
				# FeatureUnion built in this branch only has "univ_select" -- verify.
				if CFG['do_pca']==3:
					param_grid = dict(features__pca__n_components=SP.unique(SP.round_(SP.logspace(1.0,max(SP.log2(Ytr.shape[1]), SP.log2(10)),num=min(5,Ytr.shape[1]),base=2.0))),
									  features__univ_select__k=SP.unique(SP.round_(SP.logspace(3.0,SP.log2(Ytr.shape[1]),num=min(10,Ytr.shape[1]),base=2.0))),
									  svm__C=[0.1, 1, 10], svm__gamma=[1e-1,1e-2,1e-3])
				else:
					C_range = 10. ** SP.arange(0, 2)
					gamma_range = 10. ** SP.arange(-5, 1)
					param_grid = dict(features__univ_select__k=SP.unique(SP.round_(SP.logspace(3.0,SP.log2(Ytr.shape[1]),num=min(10,Ytr.shape[1]),base=2.0))),
									  svm__C=C_range, svm__gamma=gamma_range)
				clf = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='f1')
				clf.fit(YtrS, lab_tr)
				print("The best classifier is: ", clf.best_estimator_)
				select_best=clf.best_estimator_.get_params()['features__univ_select']
				#names_dict[foldlabel+'_SVM']=self.geneNames[SP.argsort(-1.0*select_best.scores_)[0:(select_best.k-1)]]
				expected = lab_tst
				predicted = clf.best_estimator_.predict_proba(YtstS)
				if i<len(CV_list)-1:
					predSVM[cv_tst,:] = predicted
				else:
					predSVM_ts = predicted
				#print(clf.best_estimator_)

				# Second model: RBF SVM on the unscaled, unselected features.
				classifier = svm.SVC(kernel='rbf', gamma=0.05, class_weight='auto', probability=True)#rbf kernel for prediction
				param_grid = dict(C=[1,10], gamma=[ 1e-1,1e-2,1e-3])
				clf_rbf = GridSearchCV(classifier, param_grid=param_grid, cv=5, scoring='f1')
				clf_rbf.fit(Ytr, lab_tr)
				clf_rbf.best_estimator_.fit(Ytr, lab_tr)
				predicted = clf_rbf.best_estimator_.predict_proba(Ytst)
				if i<len(CV_list)-1:
					predSVMrbf[cv_tst,:] = predicted
				else:
					predSVMrbf_ts = predicted

			#do lasso with regularisation path
			# C grid: 50 values (logspace default), from l1_min_c(...) to 1000x it.
			cs = l1_min_c(Ytr, lab_tr, loss='log') * SP.logspace(0, 3)
			print("  Computing regularization path ...")

			# NOTE(review): penalty='l1' without an explicit solver relies on the
			# default solver supporting L1 -- confirm for the installed scikit-learn.
			lasso = linear_model.LogisticRegression(C=cs[0]*10.0, penalty='l1', tol=1e-6)
			param_grid = dict(C=cs)
			clf_lr = GridSearchCV(lasso, param_grid=param_grid, cv=5, scoring='f1')
			clf_lr.fit(Ytr, lab_tr)
			clf_lr.best_estimator_.fit(Ytr, lab_tr)
			lambda_best[i] = clf_lr.best_params_.get('C')
			predicted = clf_lr.best_estimator_.predict_proba(Ytst)

			# Refit along the whole C grid to collect the coefficient path.
			clf = linear_model.LogisticRegression(C=cs[0]*10.0, penalty='l1', tol=1e-6)
			coefs_ = []
			for c in cs:
				clf.set_params(C=c)
				clf.fit(Ytr, lab_tr)
				coefs_.append(clf.coef_.ravel().copy())


			if i<len(CV_list)-1:
				predLR[cv_tst,:] = predicted
			else:
				predLR_ts = predicted
			coefs_ = SP.array(coefs_)
			# get ordering by importance (how many times they appear)
			order=(coefs_!=0).sum(axis=0).argsort()
			order=order[::-1] # descending
			# store this order
			# (featrank_lasso and showtop are not read again in this method)
			featrank_lasso = order
			showtop= min(Ytr.shape[1], CFG['lassotop'])

			# L2 logistic regression on all features; C=1e5 means very weak regularisation.
			clfAll = linear_model.LogisticRegression(C=1e5, penalty='l2', tol=1e-6)
			clfAll.fit(Ytr, lab_tr)
			predicted = clfAll.predict_proba(Ytst)
			if i<len(CV_list)-1:
				predLRall[cv_tst,:] = predicted
			else:
				predLRall_ts = predicted
			forest = ExtraTreesClassifier(n_estimators=500,
										  random_state=0, criterion="entropy", bootstrap=False)
			#forest = RandomForestClassifier(n_estimators=500,
			#							 random_state=0, criterion="entropy")
			forest.fit(Ytr, lab_tr)
			pred = forest.predict_proba(Ytst)
			#pdb.set_trace()
			if i<len(CV_list)-1:
				predRF[cv_tst,:] = pred#[:,1]
			else:
				predRF_ts = pred#[:,1]
			importances = forest.feature_importances_
			# Spread of the importances across trees (only used by the commented-out bar call).
			std = SP.std([tree.feature_importances_ for tree in forest.estimators_],
						 axis=0)

			topfeat=min(Ytr.shape[1], rftop)
			# Indices of the topfeat most important features, most important first.
			indices = SP.argsort(importances)[::-1][0:topfeat]
			# store full feature ranking
			featrank_rf = SP.argsort(importances)[::-1]
			# Plot the feature importances of the forest
			if (i==len(CV_list)-1):
				PL.figure()
				#PL.title("Feature importances, Fold "+foldddPPlabel+', AUC='+str(SP.round_(metrics.auc(fpr, tpr),3)))
				PL.title("Feature importances")
				#PL.bar(range(topfeat), importances[indices],color="r", yerr=std[indices], align="center")
				PL.bar(range(topfeat), importances[indices],color="r", align="center")
				PL.xticks(range(topfeat), indices, rotation=70)
				PL.gca().set_xticklabels(var_names[indices])
				PL.setp(PL.gca().get_xticklabels(), fontsize=8)
				PL.xlim([-1, topfeat])
				PL.savefig(out_dir+'/RF_featureimportance_'+foldlabel+'.pdf')


		# NOTE(review): f2 is only written to in the is_RFE branch below and is not
		# closed if one of the prints raises -- a `with` block would be safer.
		f2 = open(os.path.join(out_dir,'classification_reportCV.txt')  ,'w')

		# Predicted class = column with the highest probability, shifted to 1-based.
		predRFv = SP.argmax(predRF_ts,axis=1)+1
		predRF_trv = SP.argmax(predRF,axis=1)+1
		self.scores = predRF
		self.scores_tst = predRF_ts
		# `indices` comes from the last loop iteration (the final model).
		self.ranking = var_names[indices]

		predLRv = SP.argmax(predLR_ts,axis=1)+1
		predLR_trv = SP.argmax(predLR,axis=1)+1
		self.scoresLR = predLR
		self.scoresLR_tst = predLR_ts

		predLRallv = SP.argmax(predLRall_ts,axis=1)+1
		predLRall_trv = SP.argmax(predLRall,axis=1)+1
		self.scoresLRall = predLRall
		self.scoresLRall_tst = predLRall_ts
		if CFG['is_SVM']==1:
			predSVMv = SP.argmax(predSVM_ts,axis=1)+1
			predSVM_trv = SP.argmax(predSVM,axis=1)+1
			self.scoresSVM = predSVM
			self.scoresSVM_tst = predSVM_ts

			predSVMrbfv = SP.argmax(predSVMrbf_ts,axis=1)+1
			predSVMrbf_trv = SP.argmax(predSVMrbf,axis=1)+1
			self.scoresSVMrbf = predSVMrbf
			self.scoresSVMrbf_tst = predSVMrbf_ts

		# NOTE(review): predGNB_ts is only assigned when do_pca >= 1, so with the
		# default do_pca=False this line looks like a NameError -- confirm.
		predGNBv = SP.argmax(predGNB_ts,axis=1)+1
		predGNB_trv = SP.argmax(predGNB,axis=1)+1
		self.scoresGNB = predGNB
		self.scoresGNB_tst = predGNB_ts

		print("Classification report for classifier %s:\n%s\n" % ('Gaussian Naive Bayes', metrics.classification_report(self.labels, predGNB_trv)))
		print("Classification report for classifier %s:\n%s\n" % ('Random Forest', metrics.classification_report(self.labels, predRF_trv)))
		print("Classification report for classifier %s:\n%s\n" % ('LR', metrics.classification_report(self.labels, predLR_trv)))
		print("Classification report for classifier %s:\n%s\n" % ('LRall', metrics.classification_report(self.labels, predLRall_trv)))
		if CFG['is_RFE']==1:
			print("Classification report for classifier %s:\n%s\n" % ('SVM ', metrics.classification_report(labels, predSVM>0.5)),file=f2)
		elif CFG['is_SVM']==1:
			print("Classification report for classifier %s:\n%s\n" % ('SVM', metrics.classification_report(self.labels, predSVM_trv)))
			print("Classification report for classifier %s:\n%s\n" % ('SVMrbf', metrics.classification_report(self.labels, predSVMrbf_trv)))
		f2.close()
Esempio n. 31
0
    def fit(self,xw,xwl2,y,gs=4,retrain_l1=False):
        """Fit the two-stage model (Python 2 code: uses print statements).

        Stage 1 grid-searches a classifier of type ``self.stage1_model_type``
        ('logit', 'svm' or 'rf') on ``xw`` / ``y`` and stores the best
        estimator in ``self.clf1``. Hit/miss labels ``hm_y`` are then obtained
        from ``self.suffle_hm`` and ``self.auto_gamma``. Stage 2 grid-searches
        an L1 logistic regression on ``xwl2`` / ``hm_y`` and stores the best
        estimator in ``self.clf2``.

        Parameters:
            xw: feature matrix used in stage 1.
            xwl2: feature matrix used in stage 2.
            y: class labels for stage 1.
            gs: number of StratifiedKFold splits for the stage 1 grid search.
            retrain_l1: only referenced by commented-out code in this method.

        Returns nothing; results are stored on ``self``.
        """
        print 'Stage 1'
        # NOTE(review): for any other stage1_model_type neither clf nor param_grid
        # is bound, so the GridSearchCV call below would raise NameError.
        if self.stage1_model_type == 'logit':
            clf = LogisticRegression(C=1,class_weight='balanced',penalty='l2',max_iter=300)
        elif self.stage1_model_type == 'svm':
            #clf = SVC(kernel='linear', class_weight='balanced', C=.1,probability=False)
            clf = SVC(C=1.,cache_size=500,kernel='linear',class_weight='balanced',probability=False)
        elif self.stage1_model_type == 'rf':
            clf = RandomForestClassifier(n_estimators=20,class_weight='balanced')

        # Stage 1
        # The active grids hold two nearly identical values, so the search
        # effectively evaluates a single setting per model type.
        #param_grid = dict(C=(np.array([5,3,1])))
        if self.stage1_model_type == 'logit':
            #param_grid = dict(C=(10**np.arange(1.,-2.,-0.5)))
            #param_grid = dict(C=(np.logspace(-.2, 1., 15)))
            #param_grid = dict(C=(np.arange(3,1,-0.5)))
            param_grid = dict(C=(5,5.0001))
        elif self.stage1_model_type =='svm':
            # NOTE(review): this first grid is overwritten by the next line.
            param_grid = dict(C=(np.arange(3.5,0.,-0.5)))
            param_grid = dict(C=(1.,1.00001))
            #param_grid = dict(C=(np.logspace(-1.5, 0, 10)))
            #param_grid = dict(C=(np.arange(2.,0.5,-0.05)))
            #param_grid = dict(C=(np.array([0.01, 0.1, 1, 10, 100, 1000])))
        elif self.stage1_model_type == 'rf':
            param_grid = dict(n_estimators=(20,10))

        gridclf = GridSearchCV(clf, param_grid=param_grid, cv=StratifiedKFold(n_splits=gs), n_jobs=-1,scoring='accuracy')
        gridclf.fit(xw,y)
        self.clf1 = gridclf.best_estimator_
        if self.verbose:
            print self.clf1
            #print self.clf1.coef_
        #hm_y,y_pred_train = self.estimate_hitmiss(xw,y)
        hm_y,proba = self.suffle_hm(xw,y,gamma=self.gamma,n_iter=100)
        hm_y,auto_gamma = self.auto_gamma(proba,self.gamma)
        # NOTE(review): this assignment replaces the auto_gamma attribute that was
        # just called on this instance, so a second fit() on the same object would
        # presumably fail -- confirm.
        self.auto_gamma = auto_gamma
        # NOTE(review): bare expression with no effect; a print was probably intended.
        if self.verbose: proba
        if self.verbose: print 'Average hm score', np.mean(hm_y)
        #self.clf3 = SVC(C=1.,cache_size=500,kernel='linear',class_weight='balanced',probability=False)
        #gamma=0.5
        #print 'n stage3 ',(proba>gamma).sum()
        #self.clf3.fit(xw[proba>gamma,:],y[proba>gamma])
        #if retrain_l1:
        #    self.clf1 = self.clf3
        print 'Stage 2'
        #Stage 2
        # min_c is only referenced by commented-out grids below.
        min_c = l1_min_c(xwl2,hm_y,loss='log')
        #clf2 = LogisticRegression(C=10**0.1,class_weight=None,penalty='l2',solver='sag')
        #clf2 = LogisticRegression(C=1,class_weight=None,penalty='l2',solver='sag',max_iter=300)
        #clf2 = LinearSVC(class_weight='balanced',penalty='l1',dual=False)
        clf2 = LogisticRegression(C=1.,class_weight='balanced',penalty='l1',solver='liblinear',max_iter=300)
        #clf2 = LogisticRegression(C=1.,class_weight='balanced',penalty='l2',solver='sag',max_iter=300)
        #clf2 = SVC(C=1.,cache_size=500,kernel='linear',class_weight='balanced')
        #clf2 = RandomForestClassifier(n_estimators=20,class_weight='balanced')
        #param_grid = dict(C=(10**np.arange(1.,-2.,-0.5)))
        #param_grid = dict(C=(np.arange(3,1,-0.5)))
        #param_grid = dict(C=(np.logspace(-0.5, 2., 30)))
        #param_grid = dict(C=(np.logspace(1., -2., 15)))
        #if min_c>(10**-0.2):
        #    param_grid = dict(C=(np.logspace(np.log10(min_c), 1, 15)))
        #else:
        # Active stage 2 grid: 15 C values, log-spaced between 10**-0.2 and 10.
        param_grid = dict(C=(np.logspace(-.2, 1, 15)))
        #param_grid = dict(C=(np.logspace(-.1, 0.5, 30)))
        #param_grid = dict(C=(np.logspace(0,0.00001, 2)))
        #param_grid = dict(C=(np.logspace(np.log10(min_c), 0., 15)))
        #param_grid = dict(C=(1,1.10001)) 
        #param_grid = dict(n_estimators=(20,10))

        # 2 levels balancing 
        # The triple-quoted block below is a string expression used to disable code.
        '''
        new_classes = np.zeros_like(y)
        new_classes[(y==0) & (hm_y==0)]=0
        new_classes[(y==1) & (hm_y==0)]=1
        new_classes[(y==0) & (hm_y==1)]=2
        new_classes[(y==1) & (hm_y==1)]=3

        tmp_samp_w = len(new_classes) / (len(np.unique(new_classes))*1. * np.bincount(new_classes))
        tmp_samp_w = (1.*(tmp_samp_w/tmp_samp_w.sum()))
        sample_w = new_classes.copy().astype(float)
        sample_w[new_classes==0] = tmp_samp_w[0]
        sample_w[new_classes==1] = tmp_samp_w[1]
        sample_w[new_classes==2] = tmp_samp_w[2]
        sample_w[new_classes==3] = tmp_samp_w[3]
        '''
        # Per-sample weights: n_samples / (n_classes * class_count) for each
        # sample's class in y.
        # NOTE(review): sample_w is only used by commented-out GridSearchCV calls.
        new_classes = np.zeros_like(y)
        new_classes[(y==0) ]=0
        new_classes[(y==1) ]=1

        tmp_samp_w = len(new_classes) / (len(np.unique(new_classes))*1. * np.bincount(new_classes))
        #tmp_samp_w = (1.*(tmp_samp_w/tmp_samp_w.sum()))
        sample_w = new_classes.copy().astype(float)
        sample_w[new_classes==0] = tmp_samp_w[0]
        sample_w[new_classes==1] = tmp_samp_w[1]

        #gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedKFold(hm_y,n_folds=gs),fit_params=dict(sample_weight=sample_w), n_jobs=-1,scoring='accuracy')
        #gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedKFold(hm_y,n_folds=gs),fit_params=dict(sample_weight=proba), n_jobs=-1,scoring='accuracy')
        #gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedKFold(hm_y,n_folds=gs), n_jobs=-1,scoring='precision_weighted')
        #gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedKFold(hm_y,n_folds=gs), n_jobs=-1,scoring='accuracy')
        #gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedShuffleSplit(hm_y, n_iter=50, test_size=.2,random_state=1), n_jobs=-1,scoring='accuracy')#f1_weighted
        #gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedShuffleSplit(hm_y, n_iter=50, test_size=.2,random_state=1), n_jobs=-1,scoring='f1_weighted')
        # Active search: 50 stratified shuffle splits with 20% held out, scored by
        # weighted precision.
        gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedShuffleSplit(n_splits=50, test_size=.2,random_state=1), n_jobs=-1,scoring='precision_weighted')
        #gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedShuffleSplit(hm_y, n_iter=50, test_size=.2,random_state=1), n_jobs=-1,fit_params=dict(sample_weight=sample_w),scoring='precision_weighted')
        gridclf.fit(xwl2,hm_y)
        clf2 = gridclf.best_estimator_
        #clf2.fit(xw[train_index,:][:,idx_sz],hm_y)
        if self.verbose:
            print clf2
            print clf2.coef_

        self.clf2 = clf2
Esempio n. 32
0
    def trainModel(self,
                   do_pca=False,
                   out_dir='./cache',
                   rftop=40,
                   class_labels=SP.array(['G1', 'S', 'G2M']),
                   cv=10,
                   npc=3,
                   is_SVM=1,
                   is_RFE=0,
                   scale=False):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        CFG = {}
        CFG['is_RFE'] = is_RFE  # use recursive feature selection (can be slow for large datasets)
        CFG['is_SVM'] = is_SVM  # use SVM with univariate feature selection (faster than RFE)
        CFG['CV_inner'] = cv  #inner CV for RFE_CV: either an int or 'LOOCV'
        CFG['out_dir'] = out_dir
        CFG['do_pca'] = do_pca
        CFG['lassotop'] = 20
        self.cv = cv
        Y = self.Y
        labels = self.labels
        var_names = self.geneNames
        numClasses = self.numClasses
        predRF = SP.zeros((len(labels), numClasses))
        predSVM = SP.zeros((len(labels), numClasses))
        predSVMrbf = SP.zeros((len(labels), numClasses))
        predGNB = SP.zeros((len(labels), numClasses))
        predLR = SP.zeros((len(labels), numClasses))
        predLRall = SP.zeros((len(labels), numClasses))
        names_dict = {}
        if self.cv == 'LOOCV':
            loo = LeaveOneOut(len(labels))
            CV_list = (list(iter(loo)))
            CV_list.append((SP.array(range(Y.shape[0])),
                            SP.array(range(Y.shape[0]))))  #all data...
        else:
            skf = StratifiedKFold(labels, n_folds=self.cv)
            CV_list = (list(iter(skf)))
            CV_list.append((SP.array(range(Y.shape[0])),
                            SP.array(range(Y.shape[0]))))  #all data...
        lambda_best = SP.zeros((1, len(CV_list))).ravel()
        print("Performing cross validation ...")
        for i in range(len(CV_list)):
            if i < len(CV_list) - 1:
                print("Fold " + str(i + 1) + " of " + str(len(CV_list) - 1))
            else:
                print("Final model")
            # string label for this fold
            #get data of a CV run
            cv_tr = CV_list[i][0]
            cv_tst = CV_list[i][1]
            lab_tr = labels[cv_tr]
            Ytr = Y[cv_tr, :]
            Ytst = Y[cv_tst, :]
            lab_tst = labels[cv_tst]
            if (i == len(CV_list) - 1):
                foldlabel = 'full'
                if (self.Y_tst == None):
                    Ytst = Y[cv_tst, :]
                    lab_tst = labels[cv_tst]
                else:
                    foldlabel = 'Test'
                    Ytst = self.Y_tst
                    lab_tst = self.labels_tst
            else:
                foldlabel = str(i)
            if do_pca >= 1:
                npc = npc  #3
                #do PCA to get features
                pcaCC = PCA(n_components=npc, whiten=False)
                pcaCC.fit(Ytr)
                pcaTst = pcaCC.transform(Ytst)
                pcaTr = pcaCC.transform(Ytr)
                #selection = SelectKBest(k=1)
                #combined_features = FeatureUnion([("pca", pcaCC), ("univ_select", selection)])
                combined_features = FeatureUnion([("pca", pcaCC)])
                gnb = GaussianNB()
                y_pred = gnb.fit(pcaTr, lab_tr).predict_proba(pcaTst)
                if i < len(CV_list) - 1:
                    predGNB[cv_tst, :] = y_pred  #[:,1]
                else:
                    predGNB_ts = y_pred  #[:,1]
            if do_pca == 2:
                Ytr = SP.concatenate((Ytr, pcaTr), 1)
                Ytst = SP.concatenate((Ytst, pcaTst), 1)
                pcnames = []
                for pci in range(npc):
                    pcnames.append('PC' + str(pci + 1))
                var_names = SP.concatenate((var_names, SP.array(pcnames)), 1)
            print("  Computing random forest ...")

            if CFG['is_RFE'] == 1:  #Recursive feature selection with SVM
                print("  Computing RFE with SVM ...")
                svc = SVC(kernel="linear",
                          probability=False,
                          class_weight='auto')  #use linear SVM for selection
                rfecv = RFECV(estimator=svc, step=1, scoring='f1')
                param_grid = dict(estimator__C=[0.1, 1, 10, 100, 1000])
                clf_rfe = GridSearchCV(
                    rfecv, param_grid=param_grid, cv=3,
                    scoring='f1')  #GridSearch to find optimal parameters
                clf_rfe.fit(Ytr, lab_tr)
                svc = SVC(kernel="linear",
                          probability=False,
                          C=clf_rfe.best_estimator_.estimator.C,
                          class_weight='auto')  #use linear SVM for selection
                if CFG['CV_inner'] == '':
                    rfecv = RFECV(estimator=svc, step=1, scoring='f1')
                elif CFG['CV_inner'] == 'LOOCV':
                    rfecv = RFECV(estimator=svc,
                                  step=1,
                                  scoring='f1',
                                  cv=LeaveOneOut(len(lab_tr)))
                else:
                    rfecv = RFECV(estimator=svc,
                                  step=1,
                                  scoring='f1',
                                  cv=StratifiedKFold(lab_tr,
                                                     n_folds=CFG['CV_inner']))
                clf_rfe.best_estimator_.fit(Ytr, lab_tr)
                predicted = clf_rfe.best_estimator_.predict(Ytst)
                if i < len(CV_list) - 1:
                    predSVM[cv_tst, :] = predicted
                else:
                    predSVM_ts[cv_tst] = predicted
                classifier = svm.SVC(
                    kernel='rbf',
                    gamma=0.05,
                    class_weight='auto',
                    probability=True)  #rbf kernel for prediction
                param_grid = dict(C=[0.1, 1], gamma=[1e-1, 1e-2, 1e-3])
                clf_rbf = GridSearchCV(classifier,
                                       param_grid=param_grid,
                                       cv=3,
                                       scoring='f1')
                clf_rbf.fit(Ytr[:, clf_rfe.best_estimator_.ranking_ == 1],
                            lab_tr)
                clf_rbf.best_estimator_.fit(
                    Ytr[:, clf_rfe.best_estimator_.ranking_ == 1], lab_tr)
                predicted = clf_rbf.best_estimator_.predict_proba(
                    Ytst[:, clf_rfe.best_estimator_.ranking_ == 1])
                if i < len(CV_list) - 1:
                    predSVMrbf[cv_tst, :] = predicted
                fpr, tpr, thresholds = metrics.roc_curve(
                    lab_tst, predicted[:, 1])
                if (i == len(CV_list) - 1) | CFG["CV_plots"] > 0:
                    PL.figure()
                    PL.plot(fpr, tpr)
                    PL.savefig(CFG['out_dir'] + '/RF_SVM_' + foldlabel +
                               '.pdf')
                    names_dict[foldlabel + '_SVM'] = self.geneNames[
                        clf_rfe.best_estimator_.ranking_ == 1]
            elif CFG[
                    'is_SVM'] == 1:  #univariate FS with rbf SVM; choose this if you hava a large data set (many features, eg RNAseq)
                print("  SVM feature selection ...")
                classifier = svm.SVC(kernel='rbf',
                                     gamma=0.05,
                                     class_weight='auto',
                                     probability=True)
                selection = SelectKBest(k=1)
                combined_features = FeatureUnion([("univ_select", selection)])

                X_features = combined_features.fit(Ytr, lab_tr).transform(Ytr)
                scaler = preprocessing.StandardScaler().fit(Ytr)
                YtrS = scaler.transform(Ytr)
                YtstS = scaler.transform(Ytst)

                classifier.fit(X_features, lab_tr)
                pipeline = Pipeline([("features", combined_features),
                                     ("svm", classifier)])
                if CFG['do_pca'] == 3:
                    param_grid = dict(
                        features__pca__n_components=SP.unique(
                            SP.round_(
                                SP.logspace(1.0,
                                            max(SP.log2(Ytr.shape[1]),
                                                SP.log2(10)),
                                            num=min(5, Ytr.shape[1]),
                                            base=2.0))),
                        features__univ_select__k=SP.unique(
                            SP.round_(
                                SP.logspace(3.0,
                                            SP.log2(Ytr.shape[1]),
                                            num=min(10, Ytr.shape[1]),
                                            base=2.0))),
                        svm__C=[0.1, 1, 10],
                        svm__gamma=[1e-1, 1e-2, 1e-3])
                else:
                    C_range = 10.**SP.arange(0, 2)
                    gamma_range = 10.**SP.arange(-5, 1)
                    param_grid = dict(features__univ_select__k=SP.unique(
                        SP.round_(
                            SP.logspace(3.0,
                                        SP.log2(Ytr.shape[1]),
                                        num=min(10, Ytr.shape[1]),
                                        base=2.0))),
                                      svm__C=C_range,
                                      svm__gamma=gamma_range)
                clf = GridSearchCV(pipeline,
                                   param_grid=param_grid,
                                   cv=5,
                                   scoring='f1')
                clf.fit(YtrS, lab_tr)
                print("The best classifier is: ", clf.best_estimator_)
                select_best = clf.best_estimator_.get_params(
                )['features__univ_select']
                #names_dict[foldlabel+'_SVM']=self.geneNames[SP.argsort(-1.0*select_best.scores_)[0:(select_best.k-1)]]
                expected = lab_tst
                predicted = clf.best_estimator_.predict_proba(YtstS)
                if i < len(CV_list) - 1:
                    predSVM[cv_tst, :] = predicted
                else:
                    predSVM_ts = predicted
                #print(clf.best_estimator_)

                classifier = svm.SVC(
                    kernel='rbf',
                    gamma=0.05,
                    class_weight='auto',
                    probability=True)  #rbf kernel for prediction
                param_grid = dict(C=[1, 10], gamma=[1e-1, 1e-2, 1e-3])
                clf_rbf = GridSearchCV(classifier,
                                       param_grid=param_grid,
                                       cv=5,
                                       scoring='f1')
                clf_rbf.fit(Ytr, lab_tr)
                clf_rbf.best_estimator_.fit(Ytr, lab_tr)
                predicted = clf_rbf.best_estimator_.predict_proba(Ytst)
                if i < len(CV_list) - 1:
                    predSVMrbf[cv_tst, :] = predicted
                else:
                    predSVMrbf_ts = predicted

            #do lasso with regularisation path
            cs = l1_min_c(Ytr, lab_tr, loss='log') * SP.logspace(0, 3)
            print("  Computing regularization path ...")

            lasso = linear_model.LogisticRegression(C=cs[0] * 10.0,
                                                    penalty='l1',
                                                    tol=1e-6)
            param_grid = dict(C=cs)
            clf_lr = GridSearchCV(lasso,
                                  param_grid=param_grid,
                                  cv=5,
                                  scoring='f1')
            clf_lr.fit(Ytr, lab_tr)
            clf_lr.best_estimator_.fit(Ytr, lab_tr)
            lambda_best[i] = clf_lr.best_params_.get('C')
            predicted = clf_lr.best_estimator_.predict_proba(Ytst)

            clf = linear_model.LogisticRegression(C=cs[0] * 10.0,
                                                  penalty='l1',
                                                  tol=1e-6)
            coefs_ = []
            for c in cs:
                clf.set_params(C=c)
                clf.fit(Ytr, lab_tr)
                coefs_.append(clf.coef_.ravel().copy())

            if i < len(CV_list) - 1:
                predLR[cv_tst, :] = predicted
            else:
                predLR_ts = predicted
            coefs_ = SP.array(coefs_)
            # get ordering by importance (how many times they appear)
            order = (coefs_ != 0).sum(axis=0).argsort()
            order = order[::-1]  # descending
            # store this order
            featrank_lasso = order
            showtop = min(Ytr.shape[1], CFG['lassotop'])

            clfAll = linear_model.LogisticRegression(C=1e5,
                                                     penalty='l2',
                                                     tol=1e-6)
            clfAll.fit(Ytr, lab_tr)
            predicted = clfAll.predict_proba(Ytst)
            if i < len(CV_list) - 1:
                predLRall[cv_tst, :] = predicted
            else:
                predLRall_ts = predicted
            forest = ExtraTreesClassifier(n_estimators=500,
                                          random_state=0,
                                          criterion="entropy",
                                          bootstrap=False)
            #forest = RandomForestClassifier(n_estimators=500,
            #							 random_state=0, criterion="entropy")
            forest.fit(Ytr, lab_tr)
            pred = forest.predict_proba(Ytst)
            #pdb.set_trace()
            if i < len(CV_list) - 1:
                predRF[cv_tst, :] = pred  #[:,1]
            else:
                predRF_ts = pred  #[:,1]
            importances = forest.feature_importances_
            std = SP.std(
                [tree.feature_importances_ for tree in forest.estimators_],
                axis=0)

            topfeat = min(Ytr.shape[1], rftop)
            indices = SP.argsort(importances)[::-1][0:topfeat]
            # store full feature ranking
            featrank_rf = SP.argsort(importances)[::-1]
            # Plot the feature importances of the forest
            if (i == len(CV_list) - 1):
                PL.figure()
                #PL.title("Feature importances, Fold "+foldddPPlabel+', AUC='+str(SP.round_(metrics.auc(fpr, tpr),3)))
                PL.title("Feature importances")
                #PL.bar(range(topfeat), importances[indices],color="r", yerr=std[indices], align="center")
                PL.bar(range(topfeat),
                       importances[indices],
                       color="r",
                       align="center")
                PL.xticks(range(topfeat), indices, rotation=70)
                PL.gca().set_xticklabels(var_names[indices])
                PL.setp(PL.gca().get_xticklabels(), fontsize=8)
                PL.xlim([-1, topfeat])
                PL.savefig(out_dir + '/RF_featureimportance_' + foldlabel +
                           '.pdf')

        f2 = open(os.path.join(out_dir, 'classification_reportCV.txt'), 'w')

        predRFv = SP.argmax(predRF_ts, axis=1) + 1
        predRF_trv = SP.argmax(predRF, axis=1) + 1
        self.scores = predRF
        self.scores_tst = predRF_ts
        self.ranking = var_names[indices]

        predLRv = SP.argmax(predLR_ts, axis=1) + 1
        predLR_trv = SP.argmax(predLR, axis=1) + 1
        self.scoresLR = predLR
        self.scoresLR_tst = predLR_ts

        predLRallv = SP.argmax(predLRall_ts, axis=1) + 1
        predLRall_trv = SP.argmax(predLRall, axis=1) + 1
        self.scoresLRall = predLRall
        self.scoresLRall_tst = predLRall_ts
        if CFG['is_SVM'] == 1:
            predSVMv = SP.argmax(predSVM_ts, axis=1) + 1
            predSVM_trv = SP.argmax(predSVM, axis=1) + 1
            self.scoresSVM = predSVM
            self.scoresSVM_tst = predSVM_ts

            predSVMrbfv = SP.argmax(predSVMrbf_ts, axis=1) + 1
            predSVMrbf_trv = SP.argmax(predSVMrbf, axis=1) + 1
            self.scoresSVMrbf = predSVMrbf
            self.scoresSVMrbf_tst = predSVMrbf_ts

        predGNBv = SP.argmax(predGNB_ts, axis=1) + 1
        predGNB_trv = SP.argmax(predGNB, axis=1) + 1
        self.scoresGNB = predGNB
        self.scoresGNB_tst = predGNB_ts

        print("Classification report for classifier %s:\n%s\n" %
              ('Gaussian Naive Bayes',
               metrics.classification_report(self.labels, predGNB_trv)))
        print("Classification report for classifier %s:\n%s\n" %
              ('Random Forest',
               metrics.classification_report(self.labels, predRF_trv)))
        print("Classification report for classifier %s:\n%s\n" %
              ('LR', metrics.classification_report(self.labels, predLR_trv)))
        print("Classification report for classifier %s:\n%s\n" %
              ('LRall',
               metrics.classification_report(self.labels, predLRall_trv)))
        if CFG['is_RFE'] == 1:
            print(
                "Classification report for classifier %s:\n%s\n" %
                ('SVM ', metrics.classification_report(labels, predSVM > 0.5)),
                file=f2)
        elif CFG['is_SVM'] == 1:
            print("Classification report for classifier %s:\n%s\n" %
                  ('SVM',
                   metrics.classification_report(self.labels, predSVM_trv)))
            print("Classification report for classifier %s:\n%s\n" %
                  ('SVMrbf',
                   metrics.classification_report(self.labels, predSVMrbf_trv)))
        f2.close()
Esempio n. 33
0
    def fit(self, xw, xwl2, y, gs=4, model_type='logit', verbose=True):
        """Fit the two-stage model and store it on ``self.clf1``/``self.clf2``.

        Stage 1 fits a class-balanced classifier on ``xw`` against ``y``.
        ``self.suffle_hm`` then yields a second label vector ``hm_y``
        (presumably hit/miss labels for the stage-1 model -- confirm against
        ``suffle_hm``). Stage 2 grid-searches ``C`` for an L2-penalised
        logistic regression on ``xwl2`` against ``hm_y`` and keeps the best
        estimator.

        Parameters
        ----------
        xw : array-like
            Feature matrix used by the stage-1 classifier.
        xwl2 : array-like
            Feature matrix used by the stage-2 classifier.
        y : array-like
            Targets for stage 1.
        gs : int
            Number of stratified folds for the stage-2 grid search.
        model_type : str
            ``'logit'`` selects logistic regression for stage 1; any other
            value selects a linear-kernel SVC.
        verbose : bool
            Stored on ``self.verbose``; when true the fitted estimators and
            their coefficients are printed.

        Returns
        -------
        None
            Results are stored on ``self.clf1`` and ``self.clf2``.
        """
        self.verbose = verbose
        if model_type == 'logit':
            clf = LogisticRegression(C=1,
                                     class_weight='balanced',
                                     penalty='l2',
                                     max_iter=300)
        else:
            clf = SVC(C=1.,
                      cache_size=500,
                      kernel='linear',
                      class_weight='balanced',
                      probability=False)

        # NOTE: an RFECV wrapper feature selection and a grid search over C
        # for stage 1 existed here but were disabled; stage 1 is fitted with
        # the fixed hyper-parameters above.
        self.clf1 = clf
        self.clf1.fit(xw, y)
        if self.verbose:
            print(self.clf1)
            print(self.clf1.coef_)

        # Only the labels are used below; the second return value was only
        # consumed by a disabled sample-weighting variant.
        hm_y, _ = self.suffle_hm(xw, y, gamma=.8, n_iter=100)

        # Stage 2
        print('Stage 2')
        min_c = l1_min_c(xwl2, hm_y, loss='log')
        # Single pre-formatted argument so the output is identical whether
        # print is a statement (Python 2) or a function (Python 3).
        print('minimum c:  %s' % (min_c,))
        clf2 = LogisticRegression(C=1.,
                                  class_weight='balanced',
                                  penalty='l2',
                                  solver='liblinear',
                                  max_iter=300)
        param_grid = dict(C=(np.logspace(-.2, 1., 15)))

        # NOTE: a two-level (y x hm_y) sample-weight balancing passed through
        # ``fit_params`` was disabled; the search below is unweighted.
        # NOTE(review): StratifiedKFold is called with the legacy
        # (labels, n_folds) signature -- confirm which scikit-learn module
        # the file imports it from.
        gridclf = GridSearchCV(clf2,
                               param_grid=param_grid,
                               cv=StratifiedKFold(hm_y, n_folds=gs),
                               n_jobs=-1,
                               scoring='accuracy')
        gridclf.fit(xwl2, hm_y)
        clf2 = gridclf.best_estimator_
        print('class order :  %s' % (clf2.classes_,))

        if self.verbose:
            print(clf2)
            print(clf2.coef_)

        self.clf2 = clf2
Esempio n. 34
0
def grid_search_lr_c(X_train,
                     y_train,
                     df_coef_path=False,
                     pic_coefpath_title='Logistic Regression Path',
                     pic_coefpath=False,
                     pic_performance_title='Logistic Regression Performance',
                     pic_performance=False):
    """
    Walk an L1 logistic-regression regularisation path and pick the C with
    the best KS among the fits that have no negative coefficient.

    The path is ``l1_min_c(X_train, y_train) * np.logspace(0, 3)``. For each
    C the model is refitted and its KS is computed with ``compute_ks`` on the
    training data itself.

    :param X_train: features dataframe (its ``columns`` label the coefficients)
    :param y_train: target
    :param df_coef_path: the file path for logistic regression coefficient dataframe;
        falsy to skip writing it
    :param pic_coefpath_title: the pic title for coefficient path picture
    :param pic_coefpath: the file path for coefficient path picture; falsy to
        show the figure instead of saving it
    :param pic_performance_title: the pic title for ks performance picture
    :param pic_performance: the file path for ks performance picture; falsy to
        show the figure instead of saving it
    :return: a tuple of c and ks value with the best ks performance
    :raises ValueError: if every point on the path has a negative coefficient
    """
    # init a LogisticRegression model
    clf_l1_LR = LogisticRegression(C=0.1,
                                   penalty='l1',
                                   tol=0.01,
                                   class_weight='balanced')
    cs = l1_min_c(X_train, y_train, loss='log') * np.logspace(0, 3)

    print("Computing regularization path ...")
    start = datetime.now()
    print(start)
    coefs_ = []
    ks = []
    for c in cs:
        clf_l1_LR.set_params(C=c)
        clf_l1_LR.fit(X_train, y_train)
        coefs_.append(clf_l1_LR.coef_.ravel().copy())

        proba = clf_l1_LR.predict_proba(X_train)[:, 1]
        ks.append(compute_ks(proba, y_train))

    end = datetime.now()
    print(end)
    print("This took ", end - start)
    coef_cv_df = pd.DataFrame(coefs_, columns=X_train.columns)
    coef_cv_df['ks'] = ks
    coef_cv_df['c'] = cs

    if df_coef_path:
        file_name = df_coef_path if isinstance(df_coef_path, str) else None
        coef_cv_df.to_csv(file_name)

    coefs_ = np.array(coefs_)

    # Coefficient path figure.
    plt.figure('fig1')
    plt.plot(np.log10(cs), coefs_)
    plt.xlabel('log(C)')
    plt.ylabel('Coefficients')
    plt.title(pic_coefpath_title)
    plt.axis('tight')
    if pic_coefpath:
        file_name = pic_coefpath if isinstance(pic_coefpath, str) else None
        plt.savefig(file_name)
    else:
        plt.show()

    # KS-versus-C figure.
    plt.figure('fig2')
    plt.plot(np.log10(cs), ks)
    plt.xlabel('log(C)')
    plt.ylabel('ks score')
    plt.title(pic_performance_title)
    plt.axis('tight')
    if pic_performance:
        file_name = pic_performance if isinstance(pic_performance,
                                                  str) else None
        plt.savefig(file_name)
    else:
        plt.show()

    # Positions on the path where no coefficient is negative. The argmax is
    # taken over this subset, so it must be mapped back to a position in the
    # full path before indexing cs/ks (indexing with the subset position
    # returned the wrong C whenever an earlier point had been filtered out).
    admissible = np.flatnonzero((coefs_ < 0).sum(axis=1) == 0)
    idx = admissible[np.asarray(ks)[admissible].argmax()]

    return (cs[idx], ks[idx])
Esempio n. 35
0
from sklearn import datasets
from sklearn.svm import l1_min_c

# Regularisation path of an L1-penalised logistic regression on a binary
# subset of the iris data.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Keep only the samples whose label is not 2.
X = X[y != 2]
y = y[y != 2]

# Subtract the per-column mean.
X -= np.mean(X, 0)

################################################################################
# Demo path functions

cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)


print("Computing regularization path ...")
start = datetime.now()
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
# Hyper-parameters are set through set_params(); fit() only takes the data,
# so passing C to fit() fails on current scikit-learn releases.
coefs_ = []
for c in cs:
    clf.set_params(C=c)
    clf.fit(X, y)
    coefs_.append(clf.coef_.ravel().copy())
print("This took ", datetime.now() - start)

coefs_ = np.array(coefs_)
pl.plot(np.log10(cs), coefs_)
ymin, ymax = pl.ylim()
pl.xlabel('log(C)')
pl.ylabel('Coefficients')
pl.title('Logistic Regression Path')
pl.axis('tight')
Esempio n. 36
0
def logisticL1NestedCV(df,
                       outcomeVar,
                       predVars,
                       nFolds=10,
                       LPO=None,
                       Cs=10,
                       n_jobs=1,
                       scorer='log_loss'):
    """Apply logistic regression with L1-regularization (LASSO) to df.
    Uses nested cross-validation framework with inner folds to optimize C
    and outer test folds to evaluate performance.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain outcome and predictor variables. Rows with a missing
        value in any of those columns are dropped.
    outcomeVar : str
    predVars : ndarray or list
        Predictor variables in the model.
    nFolds : int
        N-fold stratified cross-validation (ignored when LPO is given).
    LPO : int or None
        Use Leave-P-Out cross-validation instead of StratifiedNFoldCV
    Cs : int, list or None
        Each of the values in Cs describes the inverse of regularization strength.
        If Cs is an int, that many values are spaced logarithmically from
        l1_min_c(X, y) up to 1e7 times that value; None uses 10 values.
        Smaller values specify stronger regularization.
    n_jobs : int
        Passed to LogisticRegressionCV.
    scorer : str
        Only recorded in the result under 'scorer'; the inner CV always
        scores with negated log-loss.

    Returns
    -------
    results : dict
        Contains results as keys below:
        fpr:            (100, ) average FPR for ROC
        tpr:            (100, ) average TPR for ROC
        AUC:            (outerFolds, ) AUC of ROC for each outer test fold
        mAUC:           (1, ) AUC of the average ROC
        ACC:            (outerFolds, ) accuracy across outer test folds
        mACC:           (1, ) mean of ACC
        scores:         (outerFolds, innerFolds, Cs) score for each C across inner and outer CV folds
        scorer:         the scorer argument, as passed
        optimalCs:      (outerFolds, ) optimal C from each set of inner CV
        C:              geometric mean of optimalCs, used for the final refit
        finalResult:    final fitted model with predict() exposed
        prob:           (N,) pd.Series of predicted probabilities avg over outer folds
        varList:        (Nvars, ) list of vars with non-zero coef in final model
        Cs:             (Cs, ) pre-specified grid of Cs
        coefs:          (outerFolds, predVars) refit with optimalC in each fold
        paths:          (outerFolds, Cs, predVars + intercept) avg across inner folds
        Xvars:          list of all vars in X
        Yvar:           name of outcome variable
        N:              total number of rows/instances in the model
        plus the 'Sensitivity' and 'Specificity' entries returned by rocStats"""

    if not isinstance(predVars, list):
        predVars = list(predVars)

    tmp = df[[outcomeVar] + predVars].dropna()
    X, y = tmp[predVars].astype(float), tmp[outcomeVar].astype(float)

    # Grid construction follows the sklearn example:
    # https://scikit-learn.org/stable/auto_examples/linear_model/plot_logistic_path.html
    if np.isscalar(Cs):
        Cs = l1_min_c(X, y, loss='log') * np.logspace(0, 7, Cs)
    elif Cs is None:
        Cs = l1_min_c(X, y, loss='log') * np.logspace(0, 7, 10)

    if LPO is None:
        innerCV = StratifiedKFold(n_splits=nFolds, shuffle=True)
        outerCV = StratifiedKFold(n_splits=nFolds, shuffle=True)
    else:
        innerCV = LeavePOut(LPO)
        outerCV = LeavePOut(LPO)

    # Size the per-fold result arrays from the splitter itself: Leave-P-Out
    # yields a number of outer splits unrelated to nFolds, so sizing by
    # nFolds overflowed the arrays (or left NaN columns) in that mode.
    nOuter = outerCV.get_n_splits(X, y)

    scorerFunc = sklearn.metrics.make_scorer(sklearn.metrics.log_loss,
                                             greater_is_better=False,
                                             needs_proba=True,
                                             needs_threshold=False,
                                             labels=[0, 1])

    fpr = np.linspace(0, 1, 100)
    tpr = np.nan * np.zeros((fpr.shape[0], nOuter))
    acc = np.nan * np.zeros(nOuter)
    auc = np.nan * np.zeros(nOuter)
    paths = []
    coefs = []
    probs = []
    optimalCs = np.nan * np.zeros(nOuter)
    scores = []

    for outi, (trainInd, testInd) in enumerate(outerCV.split(X=X, y=y)):
        Xtrain, Xtest = X.iloc[trainInd], X.iloc[testInd]
        ytrain, ytest = y.iloc[trainInd], y.iloc[testInd]

        # With refit=True, the scores are averaged across all folds,
        # and the coefs and the C that corresponds to the best score is taken,
        # and a final refit is done using these parameters.
        model = sklearn.linear_model.LogisticRegressionCV(Cs=Cs,
                                                          cv=innerCV,
                                                          penalty='l1',
                                                          solver='liblinear',
                                                          scoring=scorerFunc,
                                                          refit=True,
                                                          n_jobs=n_jobs)

        results = model.fit(X=Xtrain, y=ytrain)
        prob = results.predict_proba(Xtest)

        # Column of predict_proba that holds P(class == 1).
        class1Ind = np.nonzero(results.classes_ == 1)[0][0]
        fprTest, tprTest, _ = sklearn.metrics.roc_curve(
            ytest, prob[:, class1Ind])

        # Resample this fold's ROC onto the common FPR grid for averaging.
        tpr[:, outi] = np.interp(fpr, fprTest, tprTest)
        auc[outi] = sklearn.metrics.auc(fprTest, tprTest)
        acc[outi] = sklearn.metrics.accuracy_score(ytest,
                                                   np.round(prob[:,
                                                                 class1Ind]),
                                                   normalize=True)
        optimalCs[outi] = results.C_[0]
        scores.append(results.scores_[1])
        paths.append(results.coefs_paths_[1])
        coefs.append(results.coef_)
        probs.append(pd.Series(prob[:, class1Ind], index=Xtest.index))

    meanTPR = np.mean(tpr, axis=1)
    # Pin the averaged ROC curve to its (0, 0) and (1, 1) endpoints.
    meanTPR[0], meanTPR[-1] = 0, 1
    meanACC = np.mean(acc)
    meanAUC = sklearn.metrics.auc(fpr, meanTPR)
    # Geometric mean of the per-fold optimal Cs.
    meanC = 10**np.mean(np.log10(optimalCs))
    paths = np.concatenate([p.mean(axis=0, keepdims=True) for p in paths],
                           axis=0)
    scores = np.concatenate([s[None, :, :] for s in scores], axis=0)

    # Compute mean probability over test predictions in CV
    probS = pd.concat(probs).groupby(level=0).mean()
    probS.name = 'Prob'

    # Refit all the data with the optimal C for variable selection and
    # classification of holdout data
    model = sklearn.linear_model.LogisticRegression(C=meanC,
                                                    penalty='l1',
                                                    solver='liblinear')
    result = model.fit(X=X, y=y)
    varList = np.array(predVars)[result.coef_.ravel() != 0].tolist()

    rocRes = rocStats(y, np.round(probS))

    outD = {
        'fpr': fpr,  # (100, ) average FPR for ROC
        'tpr': meanTPR,  # (100, ) average TPR for ROC
        'AUC': auc,  # (outerFolds, ) AUC of ROC for each outer test fold
        'mAUC': meanAUC,  # (1, ) AUC of the average ROC
        'ACC': acc,  # (outerFolds, ) accuracy across outer test folds
        'mACC': meanACC,
        'scores':
        scores,  # (outerFolds, innerFolds, Cs) score for each C across inner and outer CV folds
        'scorer': scorer,
        'optimalCs':
        optimalCs,  # (outerFolds, ) optimal C from each set of inner CV
        'C': meanC,
        'finalResult': result,  # final fitted model with predict() exposed
        'prob':
        probS,  # (N,) pd.Series of predicted probabilities avg over outer folds
        'varList': varList,  # list of vars with non-zero coef in final model
        'Cs': Cs,  # pre-specified grid of Cs
        'coefs': np.concatenate(
            coefs),  # (outerFolds, predVars) refit with optimalC in each fold
        'paths':
        paths,  # (outerFolds, Cs, predVars + intercept) avg across inner folds
        'Xvars': predVars,
        'Yvar': outcomeVar,
        'N': tmp.shape[0]
    }
    outD.update(rocRes[['Sensitivity', 'Specificity']].to_dict())
    return outD
Esempio n. 37
0
def get_C_grid(X, y):
    """Return 100 C values: l1_min_c(X, y) scaled by factors from 1 to 1e3.

    The factors are spaced logarithmically, so the grid starts at the value
    returned by ``l1_min_c`` for the log loss and ends at 1000 times it.
    """
    return l1_min_c(X, y, loss='log') * np.logspace(0, 3, num=100)
from sklearn import linear_model
from sklearn import datasets
from sklearn.svm import l1_min_c

# Build a two-class problem from iris by dropping every sample labelled 2.
iris = datasets.load_iris()
binary_rows = iris.target != 2
X = iris.data[binary_rows]
y = iris.target[binary_rows]

# Subtract the per-column mean.
X -= X.mean(axis=0)

# #############################################################################
# Demo path functions

# 50 values of C, from l1_min_c(X, y) up to 1000 times that value.
cs = np.logspace(0, 3) * l1_min_c(X, y, loss='log')


print("Computing regularization path ...")
start = datetime.now()
clf = linear_model.LogisticRegression(penalty='l1', tol=1e-6)
coefs_ = []
for c in cs:
    # One estimator is reused along the path; only C changes between fits.
    fitted = clf.set_params(C=c).fit(X, y)
    coefs_.append(fitted.coef_.ravel().copy())
print("This took ", datetime.now() - start)

# One row of coefficients per value of C.
coefs_ = np.array(coefs_)
def reg():     
    i = datasets.make_classification(n_samples=100, n_features=2, n_informative=1, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=1, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)
    j = datasets.make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=4, n_clusters_per_class=1, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)
    k = datasets.make_classification(n_samples=100, n_features=200, n_informative=2, n_redundant=2, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)
    
##########################################################################
############## Dataset A                     #############################
##########################################################################

    X = i[0]
    y = i[1]
    
    X = X[y != 2]
    y = y[y != 2]

    X -= np.mean(X, 0)
    
    cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)


    print "Computing regularization path 2D, 2 classes..."
    start = datetime.now()
    clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    coefs_ = []
    for c in cs:
            clf.set_params(C=c)
            clf.fit(X, y)
            coefs_.append(clf.coef_.ravel().copy())
    print "This took ", datetime.now() - start

    pl.figure()
    coefs_ = np.array(coefs_)
    pl.plot(np.log10(cs), coefs_)
    ymin, ymax = pl.ylim()
    pl.xlabel('log(C)')
    pl.ylabel('Coefficients')
    pl.title('Logistic Regression Path 2D, 2-Classes')
    pl.axis('tight')
    
    
##########################################################################
############## Dataset B                     #############################
##########################################################################
    X = j[0]
    y = j[1]
    
    X = X[y != 2]
    y = y[y != 2]

    X -= np.mean(X, 0)
    
    cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)

    print "Computing regularization path 2D, 4 classes..."
    start = datetime.now()
    clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    coefs_ = []
    for c in cs:
            clf.set_params(C=c)
            clf.fit(X, y)
            coefs_.append(clf.coef_.ravel().copy())
    print "This took ", datetime.now() - start

    pl.figure()
    coefs_ = np.array(coefs_)
    pl.plot(np.log10(cs), coefs_)
    ymin, ymax = pl.ylim()
    pl.xlabel('log(C)')
    pl.ylabel('Coefficients')
    pl.title('Logistic Regression Path 2D, 4 classes')
    pl.axis('tight')
       
##########################################################################
############## Dataset C                     #############################
##########################################################################    
    ''''X = k[0]