Ejemplo n.º 1
0
    def fit(self,
            warm_start=True,
            label=None,
            save=False,
            use_glmnet=False,
            **kwargs):
        '''
        Fit the model, either through the parent class or through glmnet.

        With ``use_glmnet=False`` the call is forwarded unchanged to the
        parent class' ``fit``. With ``use_glmnet=True`` a gaussian glmnet
        model (``alpha=1.0``) is fitted on the rows selected by
        ``self.example_weights`` and the resulting coefficients are stored
        through ``self.params.set_free``.

        Note: the glmnet path only works with CV weights (i.e. all 0 or 1),
        because the weights are only used as a boolean row mask.

        Args:
            warm_start: forwarded to the parent ``fit``; unused by glmnet.
            label: forwarded to the parent ``fit``; unused by glmnet.
            save: forwarded to the parent ``fit``; unused by glmnet.
            use_glmnet: select the glmnet solver instead of the parent fit.
            **kwargs: forwarded to the parent ``fit``; unused by glmnet.

        Returns:
            None.
        '''
        if not use_glmnet:
            super(LinearRegressionModel, self).fit(warm_start, label, save,
                                                   **kwargs)
            return

        from glmnet_py import glmnet

        # Use the builtin bool/float: the np.bool / np.float aliases were
        # removed in NumPy 1.24 and raise AttributeError there.
        inds = self.example_weights.astype(bool)
        x = self.training_data.X[inds, :].copy()
        y = self.training_data.Y[inds].copy().astype(float)

        # Penalty rescaled by twice the number of selected rows.
        lambdau = np.array([
            self.L1Lambda / (2 * x.shape[0]),
        ])

        # The last column of X is withheld from glmnet and the fitted
        # intercept is appended in its place below.
        # NOTE(review): presumably that column is a constant bias feature -
        # confirm against how training_data.X is built.
        fit = glmnet(
            x=x[:, :-1],
            y=y,
            family='gaussian',
            standardize=True,
            lambdau=lambdau,
            thresh=1e-10,
            maxit=10e4,
            alpha=1.0,
        )
        self.params.set_free(np.append(np.squeeze(fit['beta']), fit['a0']))
Ejemplo n.º 2
0
    def fit(self,
            warm_start=True,
            label=None,
            save=False,
            use_glmnet=False,
            **kwargs):
        '''
        Fit the model, either through the parent class or through glmnet.

        With ``use_glmnet=False`` the call is forwarded unchanged to the
        parent class' ``fit``. With ``use_glmnet=True`` a binomial glmnet
        model (``alpha=1.0``) is fitted on the rows selected by
        ``self.example_weights`` and the resulting coefficients are stored
        through ``self.params.set_free``.

        Note: use_glmnet only works with CV weights (i.e. all 0 or 1),
        because the weights are only used as a boolean row mask.

        Args:
            warm_start: forwarded to the parent ``fit``; unused by glmnet.
            label: forwarded to the parent ``fit``; unused by glmnet.
            save: forwarded to the parent ``fit``; unused by glmnet.
            use_glmnet: select the glmnet solver instead of the parent fit.
            **kwargs: forwarded to the parent ``fit``; unused by glmnet.

        Returns:
            None.
        '''
        if not use_glmnet:
            super(LogisticRegressionModel, self).fit(warm_start, label, save,
                                                     **kwargs)
            return

        from glmnet_py import glmnet

        # NOTE(review): the penalty is scaled by the total number of rows in
        # training_data.X, not by the number of rows selected below - confirm
        # this is intended.
        lambdau = np.array([
            self.L1Lambda / self.training_data.X.shape[0],
        ])

        # Use the builtin bool/float: the np.bool / np.float aliases were
        # removed in NumPy 1.24 and raise AttributeError there.
        inds = self.example_weights.astype(bool)
        # The last column of X is withheld from glmnet.
        x = self.training_data.X[inds, :-1].copy()
        y = self.training_data.Y[inds].copy().astype(float)
        # Recode labels from -1 to 0 before handing them to glmnet.
        y[y == -1] = 0.0

        fit = glmnet(
            x=x,
            y=y,
            family='binomial',
            standardize=True,
            lambdau=lambdau,
            thresh=1e-10,
            maxit=10e3,
            alpha=1.0,
        )
        # NOTE(review): the final parameter slot is set to 0 rather than to
        # an intercept estimated by glmnet - confirm this is intended.
        self.params.set_free(np.append(fit['beta'], 0))
Ejemplo n.º 3
0
    def fit(self,
            warm_start=True,
            label=None,
            save=False,
            use_glmnet=False,
            **kwargs):
        '''
        Fit the model, either through the parent class or through glmnet.

        With ``use_glmnet=False`` the call is forwarded unchanged to the
        parent class' ``fit``. With ``use_glmnet=True`` a poisson glmnet
        model (``alpha=1.0``, no standardisation) is fitted on the rows
        selected by ``self.example_weights``.

        Note: use_glmnet only works with CV weights (i.e. all 0 or 1),
        because the weights are only used as a boolean row mask.

        Args:
            warm_start: forwarded to the parent ``fit``; unused by glmnet.
            label: forwarded to the parent ``fit``; unused by glmnet.
            save: forwarded to the parent ``fit``; unused by glmnet.
            use_glmnet: select the glmnet solver instead of the parent fit.
            **kwargs: forwarded to the parent ``fit``; unused by glmnet.

        Returns:
            None.
        '''
        if not use_glmnet:
            super(ExponentialPoissonRegressionModel,
                  self).fit(warm_start, label, save, **kwargs)
            return

        from glmnet_py import glmnet

        lambdau = np.array([
            self.L1Lambda,
        ])

        # Use the builtin bool/float: the np.bool / np.float aliases were
        # removed in NumPy 1.24 and raise AttributeError there.
        inds = self.example_weights.astype(bool)
        x = self.training_data.X[inds, :].copy()
        y = self.training_data.Y[inds].copy().astype(float)

        # NOTE(review): the glmnet result is never written back to
        # self.params, so this branch leaves the model parameters untouched -
        # confirm whether a set_free call is missing here.
        fit = glmnet(
            x=x,
            y=y,
            family='poisson',
            standardize=False,
            lambdau=lambdau,
            thresh=1e-20,
            maxit=10e4,
            alpha=1.0,
        )
Ejemplo n.º 4
0
  def fit(self, warm_start=True, label=None, save=False,
          use_glmnet=False, use_cvxpy=False, cvxpy_tol=1e-4,
          **kwargs):
    '''
    Fit the model with the parent solver, glmnet, or cvxpy.

    With neither flag set the call is forwarded unchanged to the parent
    class' ``fit``. ``use_glmnet`` takes precedence over ``use_cvxpy`` when
    both are set.

    Note: use_cvxpy or use_glmnet only works with CV weights (i.e. all 0 or 1)

    Args:
      warm_start: forwarded to the parent ``fit`` only.
      label: forwarded to the parent ``fit`` only.
      save: forwarded to the parent ``fit`` only.
      use_glmnet: fit a poisson glmnet model on the rows selected by
        ``self.example_weights``.
      use_cvxpy: minimise the weighted objective with cvxpy's SCS solver and
        store the solution through ``self.params.set_free``.
      cvxpy_tol: passed to the SCS solver as ``eps``.
      **kwargs: forwarded to the parent ``fit`` only.

    Returns:
      None.
    '''
    if not use_cvxpy and not use_glmnet:
      super(ExponentialPoissonRegressionModel, self).fit(warm_start,
                                                         label,
                                                         save,
                                                         **kwargs)

    elif use_glmnet:
      from glmnet_py import glmnet
      lambdau = np.array([self.L1Lambda,])
      # Use the builtin bool/float: the np.bool / np.float aliases were
      # removed in NumPy 1.24 and raise AttributeError there.
      inds = self.example_weights.astype(bool)
      x = self.training_data.X[inds,:].copy()
      y = self.training_data.Y[inds].copy().astype(float)
      # NOTE(review): the glmnet result is never written back to
      # self.params, so this branch leaves the model parameters untouched -
      # confirm whether a set_free call is missing here.
      fit = glmnet(x=x,
                   y=y,
                   family='poisson',
                   standardize=False,
                   lambdau=lambdau,
                   thresh=1e-20,
                   maxit=10e4,
                   alpha=1.0,
      )

    else:
      # Only reachable with use_cvxpy set (and use_glmnet unset).
      import cvxpy
      D = self.params.get_free().shape[0]
      # Start the optimisation variable at the current free parameters.
      theta = cvxpy.Variable(D, value=self.params.get_free())
      weights = self.example_weights

      X = self.training_data.X
      Y = self.training_data.Y
      # Weighted objective plus an L1 penalty on every entry of theta except
      # the last one.
      obj = cvxpy.Minimize(-Y*cvxpy.multiply(weights,(X*(theta)))
         + cvxpy.sum(cvxpy.multiply(weights, cvxpy.exp(X*(theta))))
         + self.L1Lambda * cvxpy.sum(cvxpy.abs((theta)[:-1]))
      )
      problem = cvxpy.Problem(obj)
      problem.solve(solver=cvxpy.SCS, normalize=True,
                    eps=cvxpy_tol, verbose=False, max_iters=2000)
      # On an inaccurate status, solve twice more; the second retry raises
      # the iteration cap.
      if problem.status in ('infeasible_inaccurate', 'unbounded_inaccurate'):
        problem.solve(solver=cvxpy.SCS, normalize=True,
                      eps=cvxpy_tol, verbose=False, max_iters=2000)
        problem.solve(solver=cvxpy.SCS, normalize=True,
                      eps=cvxpy_tol, verbose=False, max_iters=10000)
      try:
        self.params.set_free(theta.value)
      except Exception:
        # Best effort: report the solver status instead of propagating, but
        # no longer swallow KeyboardInterrupt / SystemExit.
        print('Bad problem?', problem.status)
Ejemplo n.º 5
0
    def glmnetFit(self, X, y, offsets, numCodons, numGenes, varsNames, lambda_min):
        """
        Fit a poisson glmnet model and export per-gene and per-codon estimates.

        When ``lambda_min`` is falsy the model is fitted with ``cvglmnet``,
        coefficients are taken at the cross-validated ``lambda_min`` and the
        CV plot is saved as ``lambda_cv.pdf``. Otherwise ``glmnet`` is fitted
        and coefficients are read at the supplied ``lambda_min``. Gene and
        codon coefficients are median-centred and written as tab-separated
        files under ``self.output``; summary lines are written to stderr.

        :param X: design matrix handed to glmnet (copied before fitting)
        :param y: response handed to glmnet (copied before fitting)
        :param offsets: passed through as the glmnet ``offset`` argument
        :param numCodons: number of codon coefficients; they follow the gene
            coefficients in the coefficient vector
        :param numGenes: number of gene coefficients; they directly follow
            the intercept in the coefficient vector
        :param varsNames: names of the non-intercept coefficients, in
            coefficient order; the second ``_``-separated field of each name
            is used as the gene/codon label
        :param lambda_min: pre-defined lambda, or a falsy value to pick the
            lambda by cross-validation
        :return: None
        """
        use_cv = not lambda_min

        # fit the model
        if use_cv:
            fit = cvglmnet(x=X.copy(), y=y.copy(), family='poisson',
                           offset=offsets, alpha=0, parallel=True, lambda_min=np.array([0]))
            coefs = cvglmnetCoef(fit, s=fit['lambda_min'])  # lambda_min lambda_1se
        else:
            fit = glmnet(x=X.copy(), y=y.copy(), family='poisson',
                         offset=offsets, alpha=0, lambda_min=np.array([0]))
            # np.float64 replaces the deprecated scipy.float64 alias, which
            # was only a re-export of the NumPy type.
            coefs = glmnetCoef(fit, s=np.float64([lambda_min]))

        # parse and scale coefficients: index 0 is the intercept, followed by
        # numGenes gene terms, numCodons codon terms and one further term.
        intercept = coefs[0][0]
        geneBetas = pd.DataFrame([[varsNames[i-1].split("_")[1], coefs[i][0]]
                                  for i in range(1, numGenes+1)], columns=["gene", "beta"])
        # centre on the median and convert from natural log to log2
        geneBetas["log2_TE"] = (geneBetas["beta"] - np.median(geneBetas["beta"])) / np.log(2)
        geneBetas.drop(["beta"], inplace=True, axis=1)
        codonBetas = pd.DataFrame([[varsNames[i-1].split("_")[1], coefs[i][0]]
                                   for i in range(numGenes+1, numGenes + numCodons + 1)], columns=["codon", "beta"])
        # centre on the median, then exponentiate back to the linear scale
        codonBetas["log_codon_dwell_time"] = (codonBetas["beta"] - np.median(codonBetas["beta"]))
        codonBetas["codon_dwell_time"] = np.exp(codonBetas["log_codon_dwell_time"])
        codonBetas.drop(["beta", "log_codon_dwell_time"], inplace=True, axis=1)
        downstreamSLBeta = coefs[numGenes + numCodons + 1][0]

        # export to local
        geneBetas.to_csv(path_or_buf=self.output + '/genesTE.csv', sep='\t',
                         header=True, index=False, float_format='%.4f')
        codonBetas.to_csv(path_or_buf=self.output + '/codons.csv', sep='\t',
                          header=True, index=False, float_format='%.4f')

        # print results
        if use_cv:
            sys.stderr.write("[results]\tlambda that gives minimum mean cv error: " + str(fit['lambda_min']) + "\n")
            sys.stderr.write("[results]\tlambda 1 se away: " + str(fit['lambda_1se']) + "\n")
        else:
            sys.stderr.write("[results]\tpre-defined lambda: " + str(lambda_min) + "\n")
        sys.stderr.write("[results]\tintercept: " + str(intercept) + "\n")
        sys.stderr.write("[results]\tbetas for 2' structure windows: " + str(downstreamSLBeta) + "\n")

        # plot the cross-validation curve (only available after cvglmnet)
        if use_cv:
            plt.figure()
            cvglmnetPlot(fit)
            plt.savefig(self.output + "/" + "lambda_cv.pdf")
            plt.clf()
Ejemplo n.º 6
0
    def _process( self, raw_data, regression_var ):
        """ Take in pre-processed data, run a regression via glmnet
         and return the results.

        A glmnet path is fitted over a geometric grid of 100 lambda values
        and one grid point is selected by comparing its coefficients with
        those at earlier grid points. Only the non-zero coefficients at the
        selected grid point are returned.

        Args:
            raw_data: A pandas.DataFrame that contains both the design matrix
            and the response vector.
            regression_var: Positional index of the column in raw_data that
            contains the response vector; every other column is used as a
            predictor.

        Returns:
            A JSON string ( DataFrame.to_json with orient='columns' ) holding
            the non-zero regression coefficients, labelled by column name,
            with the intercept labelled "Intercept" when it is non-zero.
        """

        reg_idx = int( regression_var )

        # Positional / boolean selection: DataFrame.ix and as_matrix() were
        # removed in pandas 1.0.
        is_predictor = raw_data.columns != raw_data.columns[reg_idx]
        Y = raw_data.iloc[:, reg_idx]
        X = raw_data.loc[:, is_predictor]

        # glmnetCoef places the intercept term in the first position of the
        # coefficients vector, followed by one entry per column of X. The
        # labels are therefore built from the predictor columns only, so the
        # response column cannot shift the names out of alignment.
        col_names = [ "Intercept" ] + list( X.columns.values )

        Y = Y.values.astype(np.float64)
        X = X.values.astype(np.float64)

        C = 0.75

        lamdbaMax = 2 * max( abs( np.dot( np.transpose( X ), Y )  ) ) / Y.size

        # Geometric grid: lamdbaMax / 1.3**0 ... lamdbaMax / 1.3**99.
        lambdaGrid = lamdbaMax / 1.3 ** np.arange( 0, 100 )

        betas = glmnet_py.glmnetCoef( glmnet_py.glmnet( x = X.copy(), y = Y.copy(), lambdau = lambdaGrid ) )

        j = 0
        t = 1
        last_idx = len( lambdaGrid ) - 1

        # Walk down the grid until a comparison fails ( t becomes 0 ) or the
        # end of the grid is reached.
        while( t > 0 and j < last_idx ):
            j += 1
            beta_j = betas[:,j]
            lambda_j = lambdaGrid[j]

            # NOTE(review): this compares against grid points 1 .. j-2 only,
            # skipping index 0 and index j-1 - confirm this is not an
            # off-by-one left over from 1-based indexing.
            for k in np.arange( 1, ( j - 1 ) ) :
                beta_k = betas[:,k]
                lambda_k = lambdaGrid[k]
                t = t * (max(abs(beta_j - beta_k)) / (lambda_j + lambda_k) <= C)

        coefficients = betas[:,j]

        nz_indices = coefficients.nonzero()[0]
        support_coefs = coefficients[ nz_indices ]
        support_names = [ col_names[idx] for idx in nz_indices ]

        return( pd.DataFrame( data = support_coefs , index = support_names ).to_json(orient='columns') )
Ejemplo n.º 7
0
# Build the feature matrix: vectorise the note text with count_vect, then
# rescale it twice (by per-note word count, then by per-feature total).
all_counts = count_vect.transform(pt_data['notes'].note_text)
all_counts = scipy.sparse.csc_matrix(all_counts).astype('float64')
# NOTE(review): assumes word_count is one value per note, so that the
# transposed matrix is a column vector aligned with the rows of all_counts -
# confirm.
note_sums = scipy.sparse.csc_matrix(
    pt_data['notes'].word_count).transpose().astype('float64')
# Multiply element-wise by the reciprocal of each note's word count.
all_counts = all_counts.multiply(note_sums.power(-1))
# Sum over axis 0 (one total per feature column) and multiply element-wise
# by the reciprocal of those totals.
feature_sums = scipy.sparse.csc_matrix(all_counts.sum(axis=0))
all_counts = all_counts.multiply(feature_sums.power(-1))
# Convert the result of multiply() back to CSC format.
all_counts = scipy.sparse.csc_matrix(all_counts)

from glmnet_py import glmnet
from glmnetPrint import glmnetPrint
from glmnetCoef import glmnetCoef
from glmnetPredict import glmnetPredict

# Fit a binomial glmnet model (alpha=1.0) on the training rows; the outcome
# column is multiplied by 1.0 to hand glmnet floating-point labels.
fit = glmnet(x=all_counts[train_sample, :],
             y=pt_data['notes'].loc[train_sample].surv_12mo.values * 1.0,
             family='binomial',
             alpha=1.0)

chunksize = 1000
num_s = fit['lambdau'].shape[0]
# One row per test sample, one column per value in fit['lambdau'].
predictions = np.zeros([len(test_sample), num_s])
# Predict in chunks to avoid a MemoryError. Stepping by chunksize covers the
# final partial chunk too, so it also works when there are fewer than
# chunksize test rows and never predicts on an empty slice.
for start in range(0, len(test_sample), chunksize):
    stop = start + chunksize
    predictions[start:stop, :] = glmnetPredict(
        fit,
        all_counts[test_sample[start:stop], :],
        ptype='response')

for i in range(num_s):