Example 1
def test_framing_example():

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(cur_dir, 'results', "framing.csv"))

    outcome = np.asarray(data["cong_mesg"])
    outcome_exog = patsy.dmatrix("emo + treat + age + educ + gender + income", data,
                                  return_type='dataframe')
    probit = sm.families.links.probit
    outcome_model = sm.GLM(outcome, outcome_exog, family=sm.families.Binomial(link=probit()))

    mediator = np.asarray(data["emo"])
    mediator_exog = patsy.dmatrix("treat + age + educ + gender + income", data,
                                 return_type='dataframe')
    mediator_model = sm.OLS(mediator, mediator_exog)

    tx_pos = [outcome_exog.columns.tolist().index("treat"),
              mediator_exog.columns.tolist().index("treat")]
    med_pos = outcome_exog.columns.tolist().index("emo")

    med = Mediation(outcome_model, mediator_model, tx_pos, med_pos,
                    outcome_fit_kwargs={'atol':1e-11})

    np.random.seed(4231)
    para_rslt = med.fit(method='parametric', n_rep=100)
    diff = np.asarray(para_rslt.summary() - framing_para_4231)
    assert_allclose(diff, 0, atol=1e-6)

    np.random.seed(4231)
    boot_rslt = med.fit(method='boot', n_rep=100)
    diff = np.asarray(boot_rslt.summary() - framing_boot_4231)
    assert_allclose(diff, 0, atol=1e-6)
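As context for the dmatrix calls above, here is a minimal sketch (toy data, hypothetical column names) of what patsy returns with return_type='dataframe': an Intercept column followed by one column per term, which is why the positional lookups via columns.tolist().index(...) work.

import pandas as pd
import patsy

toy = pd.DataFrame({"treat": [0, 1, 0, 1], "emo": [2.0, 5.0, 3.0, 4.0]})
dm = patsy.dmatrix("emo + treat", toy, return_type="dataframe")
print(dm.columns.tolist())                 # ['Intercept', 'emo', 'treat']
print(dm.columns.tolist().index("treat"))  # 2, as used for tx_pos above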
Example 2
def lasso_cv(x, y, x_pred=None, max_deg=3, cv=10, max_iter=1e3, return_model=False):
    """LASSO polynomial fit with cross-validation.
    
    Regularized polynomial regression (penalized least squares) over a
    range of degrees up to n = max_deg. LASSO minimizes the MSE while
    penalizing the L1 norm of the parameter vector, which drives some
    coefficients in the fitted model to zero.

    - The 'alpha' parameter (amount of penalization) is selected by k-fold CV.
    - Predicts fitted model on given values 'x_pred' (default use 'x').
    - Supports NaNs.

    """
    ind, = np.where((~np.isnan(x)) & (~np.isnan(y)))
    x_, y_ = x[ind], y[ind]
    X_ = dmatrix('C(x_, Poly)')
    if x_pred is None:
        X = dmatrix('C(x, Poly)')      # predict on original values
    else:
        X = dmatrix('C(x_pred, Poly)') # predict on given values
    # NOTE: 'normalize' was removed in scikit-learn 1.2; on newer versions,
    # standardize the basis beforehand (e.g. with StandardScaler).
    lasso = LassoCV(cv=cv, copy_X=True, normalize=True, max_iter=int(max_iter))
    lasso = lasso.fit(X_[:, 1:max_deg+1], y_)
    y_pred = lasso.predict(X[:,1:max_deg+1])
    if return_model:
        y_pred = [y_pred, lasso]
    return y_pred
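A hypothetical call (illustrative data only; lasso_cv as defined above), fitting on noisy samples and predicting on a separate grid:

import numpy as np

rng = np.random.default_rng(0)
x = np.linspace(0.0, 10.0, 60)
y = 1.5 * x - 0.1 * x ** 2 + rng.normal(scale=0.5, size=x.size)
y_fit = lasso_cv(x, y, max_deg=3, cv=5)                     # fitted values at x
y_grid = lasso_cv(x, y, x_pred=np.linspace(0.0, 10.0, 40))  # predict elsewhere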
Example 3
def plot_results(band, yatsm_config, yatsm_model, plot_type='TS'):
    step = -1 if yatsm_config['reverse'] else 1
    design = re.sub(r'[\+\-][\ ]+C\(.*\)', '', yatsm_config['design_matrix'])

    for i, r in enumerate(yatsm_model.record):
        label = 'Model {i}'.format(i=i)
        if plot_type == 'TS':
            mx = np.arange(r['start'], r['end'], step)
            mX = patsy.dmatrix(design, {'x': mx}).T

            my = np.dot(r['coef'][:, band], mX)
            mx_date = np.array([dt.datetime.fromordinal(int(_x)) for _x in mx])

        elif plot_type == 'DOY':
            yr_end = dt.datetime.fromordinal(r['end']).year
            yr_start = dt.datetime.fromordinal(r['start']).year
            yr_mid = int(yr_end - (yr_end - yr_start) / 2)

            mx = np.arange(dt.date(yr_mid, 1, 1).toordinal(),
                           dt.date(yr_mid + 1, 1, 1).toordinal(), 1)
            mX = patsy.dmatrix(design, {'x': mx}).T

            my = np.dot(r['coef'][:, band], mX)
            mx_date = np.array([dt.datetime.fromordinal(d).timetuple().tm_yday
                                for d in mx])

            label = 'Model {i} - {yr}'.format(i=i, yr=yr_mid)

        plt.plot(mx_date, my, lw=2, label=label)
    plt.legend()
Example 4
def design_formula(train_metadata, test_metadata, formula):
    """ Generate and align two design matrices.

    Parameters
    ----------
    train_metadata : pd.DataFrame
        Training metadata
    test_metadata : pd.DataFrame
        Testing metadata
    formula : str
        Statistical formula specifying design matrix

    Returns
    -------
    train_design : pd.DataFrame
        Train design matrix
    test_design : pd.DataFrame
        Test design matrix
    """
    train_design = dmatrix(formula, train_metadata,
                           return_type='dataframe')
    test_design = dmatrix(formula, test_metadata,
                          return_type='dataframe')

    # pad extra columns with zeros, so that we can still make predictions
    extra_columns = list(set(train_design.columns) -
                         set(test_design.columns))
    df = pd.DataFrame({C: np.zeros(test_design.shape[0])
                       for C in extra_columns},
                      index=test_design.index)
    test_design = pd.concat((test_design, df), axis=1)
    test_design = test_design.reindex(columns=train_design.columns)
    return train_design, test_design
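A minimal sketch of the alignment behavior, using hypothetical metadata in which the test split is missing one category level:

import pandas as pd

train_md = pd.DataFrame({"group": ["a", "b", "c", "a"]})
test_md = pd.DataFrame({"group": ["a", "b"]})  # level "c" never appears

train_design, test_design = design_formula(train_md, test_md, "group")
# the test design gains a zero-filled "group[T.c]" column and is reindexed
# to the training column order
assert list(test_design.columns) == list(train_design.columns)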
Example 5
def test_framing_example_moderator():
    # moderation without formulas, generally not useful but test anyway

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(cur_dir, 'results', "framing.csv"))

    outcome = np.asarray(data["cong_mesg"])
    outcome_exog = patsy.dmatrix("emo + treat + age + educ + gender + income", data,
                                  return_type='dataframe')
    probit = sm.families.links.probit
    outcome_model = sm.GLM(outcome, outcome_exog, family=sm.families.Binomial(link=probit()))

    mediator = np.asarray(data["emo"])
    mediator_exog = patsy.dmatrix("treat + age + educ + gender + income", data,
                                 return_type='dataframe')
    mediator_model = sm.OLS(mediator, mediator_exog)

    tx_pos = [outcome_exog.columns.tolist().index("treat"),
              mediator_exog.columns.tolist().index("treat")]
    med_pos = outcome_exog.columns.tolist().index("emo")

    ix = (outcome_exog.columns.tolist().index("age"),
          mediator_exog.columns.tolist().index("age"))
    moderators = {ix : 20}
    med = Mediation(outcome_model, mediator_model, tx_pos, med_pos,
                    moderators=moderators)

    # Just a smoke test
    np.random.seed(4231)
    med_rslt = med.fit(method='parametric', n_rep=100)
Example 6
def lr_tests(ds, alt_model, null_model='~ 1', gene_column='Gene', batch_size=2000, transformation=np.log1p, rcond=-1):
    '''
    Compare alt_model and null_model by a likelihood ratio test for every gene in ds.

    Args:
        ds (LoomConnection):    Dataset
        alt_model (str):    Formula describing alternative model
        null_model (str):   Formula describing null model
        gene_column (str):  Name of the gene labels to use in the ds (default "Gene")
        batch_size (int):   The number of genes to read from disk in each iteration (default 2000)
        transformation (function):  Transformation to apply to expression values before fitting (default np.log1p)
        rcond (float):  Conditioning for the least square fitting (default -1, which has no effect)

    Returns:
        results (DataFrame):    Dataframe with model parameter estimates for each gene, with P values from LRT.
    '''
    sample_info = pd.DataFrame()
    for k in ds.ca.keys():
        sample_info[k] = ds.ca[k]
    
    alt_design = patsy.dmatrix(alt_model, sample_info, return_type='dataframe')
    null_design = patsy.dmatrix(null_model, sample_info, return_type='dataframe')

    n = ds.shape[1]

    genes = []
    betas = []
    pvals = []

    total_batches = np.ceil(ds.shape[0] / batch_size).astype(int)
    for (ix, selection, vals) in tqdm(ds.scan(axis=0, batch_size=batch_size), total=total_batches):
        expression_matrix = transformation(vals[:, :])
        beta_alt, res_alt, rank_alt, s_alt = np.linalg.lstsq(alt_design, expression_matrix.T, rcond=rcond)
        beta_null, res_null, rank_null, s_null = np.linalg.lstsq(null_design, expression_matrix.T, rcond=rcond)

        genes.append(vals.ra[gene_column])

        ll_alt  = -n / 2. * np.log(2 * np.pi) - n / 2. * np.ma.log(res_alt  / n) - n / 2.
        ll_null = -n / 2. * np.log(2 * np.pi) - n / 2. * np.ma.log(res_null / n) - n / 2.

        llr = ll_alt - ll_null

        pval = stats.chi2.sf(2 * llr, df=(beta_alt.shape[0] - beta_null.shape[0]))
        pval = np.ma.MaskedArray(pval, mask=llr.mask).filled(1.)

        betas.append(beta_alt)
        pvals.append(pval)

    results = pd.DataFrame({gene_column: np.hstack(genes)})

    for name, beta in zip(alt_design.columns, np.hstack(betas)):
        results[name] = beta

    results['pval'] = np.hstack(pvals)

    min_pval = results.pval[results.pval != 0].min()
    results['pval'] = results.pval.clip(lower=min_pval)

    return results
Example 7
def test_patsy_577():
    X = np.random.random((10, 2))
    df = pandas.DataFrame(X, columns=["var1", "var2"])
    from patsy import dmatrix
    endog = dmatrix("var1 - 1", df)
    np.testing.assert_(data._is_using_patsy(endog, None))
    exog = dmatrix("var2 - 1", df)
    np.testing.assert_(data._is_using_patsy(endog, exog))
Example 8
def plot_results(band, cfg, model, design_info, plot_type='TS'):
    """ Plot model results

    Args:
        band (int): plot results for this band
        cfg (dict): YATSM configuration dictionary
        model (YATSM model): fitted YATSM timeseries model
        design_info (patsy.DesignInfo): patsy design information
        plot_type (str): type of plot to add results to (TS, DOY, or VAL)
    """
    # Handle reverse
    step = -1 if cfg['YATSM']['reverse'] else 1

    # Remove categorical info from predictions
    design = re.sub(r'[\+\-][\ ]+C\(.*\)', '',
                    cfg['YATSM']['design_matrix'])

    i_coef = []
    for k, v in design_info.column_name_indexes.items():
        if not re.match(r'C\(.*\)', k):
            i_coef.append(v)
    i_coef = np.asarray(i_coef)

    for i, r in enumerate(model.record):
        label = 'Model {i}'.format(i=i)
        if plot_type == 'TS':
            # Prediction
            mx = np.arange(r['start'], r['end'], step)
            mX = patsy.dmatrix(design, {'x': mx}).T

            my = np.dot(r['coef'][i_coef, band], mX)
            mx_date = np.array([dt.datetime.fromordinal(int(_x)) for _x in mx])
            # Break
            if r['break']:
                bx = dt.datetime.fromordinal(r['break'])
                plt.axvline(bx, c='red', lw=2)

        elif plot_type in ('DOY', 'VAL'):
            yr_end = dt.datetime.fromordinal(r['end']).year
            yr_start = dt.datetime.fromordinal(r['start']).year
            yr_mid = int(yr_end - (yr_end - yr_start) / 2)

            mx = np.arange(dt.date(yr_mid, 1, 1).toordinal(),
                           dt.date(yr_mid + 1, 1, 1).toordinal(), 1)
            mX = patsy.dmatrix(design, {'x': mx}).T

            my = np.dot(r['coef'][i_coef, band], mX)
            mx_date = np.array([dt.datetime.fromordinal(d).timetuple().tm_yday
                                for d in mx])

            label = 'Model {i} - {yr}'.format(i=i, yr=yr_mid)

        plt.plot(mx_date, my, lw=2, label=label)
    leg = plt.legend()
    leg.set_draggable(True)
Example 9
    def covariance(self, time, scale_params, smooth_params, scale_data,
                   smooth_data):
        """
        Returns a Gaussian process covariance matrix.

        Parameters
        ----------
        time : array-like
            The time points at which the fitted covariance matrix is
            calculated.
        scale_params : array-like
            The regression parameters for the scaling part
            of the covariance structure.
        smooth_params : array-like
            The regression parameters for the smoothing part
            of the covariance structure.
        scale_data : DataFrame
            The data used to determine the scale parameter,
            must have len(time) rows.
        smooth_data : DataFrame
            The data used to determine the smoothness parameter,
            must have len(time) rows.

        Returns
        -------
        A covariance matrix.

        Notes
        -----
        If the model was fit using formulas, `scale_data` and `smooth_data`
        should be DataFrames containing all variables that were present in
        the respective scaling and smoothing formulas used to fit the model.
        Otherwise, `scale_data` and `smooth_data` should be data arrays whose
        columns align with the fitted scaling and smoothing parameters.

        The covariance is only for the Gaussian process and does not include
        the white noise variance.
        """

        if not hasattr(self.data, "scale_design_info"):
            sca = np.dot(scale_data, scale_params)
            smo = np.dot(smooth_data, smooth_params)
        else:
            sc = patsy.dmatrix(self.data.scale_design_info, scale_data)
            sm = patsy.dmatrix(self.data.smooth_design_info, smooth_data)
            sca = np.exp(np.dot(sc, scale_params))
            smo = np.exp(np.dot(sm, smooth_params))

        return self.cov.get_cov(time, sca, smo)
Example 10
    def transform(self, data):
        """Transform with estimator using formula.

        Transform the data using formula, then transform it
        using the estimator.

        Parameters
        ----------
        data : dict-like (pandas dataframe)
            Input data. Column names need to match variables in formula.
        """
        if self.return_type == 'dataframe':
            return dmatrix(self.design_, data, return_type='dataframe')
        else:
            return np.array(dmatrix(self.design_, data))
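The reuse of self.design_ above relies on patsy's stateful transforms: building a matrix from a saved DesignInfo replays the training-time factorization on new data. A minimal sketch:

import pandas as pd
from patsy import dmatrix

train = pd.DataFrame({"x": [1.0, 2.0, 3.0]})
design = dmatrix("center(x)", train)  # memorizes the training mean (2.0)
info = design.design_info

new = pd.DataFrame({"x": [4.0]})
print(dmatrix(info, new))             # centers with the training mean: 4.0 - 2.0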
Example 11
def regress_out(sample_info, expression_matrix, covariate_formula, design_formula='1'):
    ''' Implementation of limma's removeBatchEffect function
    '''
    # Ensure intercept is not part of covariates
    covariate_formula += ' - 1'
    covariate_matrix = patsy.dmatrix(covariate_formula, sample_info)
    design_matrix = patsy.dmatrix(design_formula, sample_info)
    
    design_batch = np.hstack((design_matrix, covariate_matrix))

    coefficients, res, rank, s = np.linalg.lstsq(design_batch, expression_matrix.T, rcond=None)
    # keep only the coefficients of the covariate columns (the trailing block)
    beta = coefficients[design_matrix.shape[1]:]
    regressed = expression_matrix - covariate_matrix.dot(beta).T

    return regressed
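A hypothetical call (illustrative data; regress_out as defined above), removing a batch label from a genes-by-samples matrix:

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
sample_info = pd.DataFrame({"batch": ["b1", "b1", "b2", "b2"]})
expr = rng.normal(size=(100, 4))                  # genes x samples
corrected = regress_out(sample_info, expr, "C(batch)")
assert corrected.shape == expr.shape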
Example 12
    def fitMinSpline(self, Yvar, Xvar, smoothingWindow, plot=False, plotVar=None):
        '''
        Fit/interpolate a spline in the data.
        '''
        # use patsy to build a natural cubic spline basis (7 df, no intercept)
        X = np.asarray(patsy.dmatrix("cr(x, df=7)-1", {"x": Xvar}))
        # wrap the basis in a dataframe aligned with the response
        modDat = pd.DataFrame(X, index=Yvar.index)
        # name the basis columns X1-X7
        modDat.columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7']
        half = self._smoothingWindow // 2
        modDatTrunc = modDat.iloc[half:-half].copy()
        window = np.ones(self._smoothingWindow) / float(self._smoothingWindow)
        modDatTrunc['Y'] = np.convolve(Yvar, window, 'same')[half:-half]
        mod = smf.quantreg('Y~X1+X2+X3+X4+X5', modDatTrunc)
        res = mod.fit(q=0.01)
        preds = pd.Series(res.predict(modDat), index=Xvar.index)
        if plot:
            plotDF = pd.concat([plotVar, Yvar, preds], axis=1)
            print(plotDF.columns)
            plotDF.columns = [plotVar.name, Yvar.name, 'fitted']
            p = ggplot(aes(x=plotVar.name, y=Yvar.name), data=plotDF) + geom_line() +\
                geom_line(aes(y='fitted'), color='red') +\
                ylim(0, 5) +\
                xlab('') + ylab('Sensor (V)')
            print(p)
        # return regression predictions
        return preds
Example 13
    def test_predict_formula(self):

        n = 100
        np.random.seed(34234)
        time = 50 * np.random.uniform(size=n)
        status = np.random.randint(0, 2, n).astype(np.float64)
        exog = np.random.uniform(1, 2, size=(n, 2))

        df = pd.DataFrame({"time": time, "status": status,
                           "exog1": exog[:, 0], "exog2": exog[:, 1]})

        fml = "time ~ 0 + exog1 + np.log(exog2) + exog1*exog2"
        model1 = PHReg.from_formula(fml, df, status=status)
        result1 = model1.fit()

        from patsy import dmatrix
        dfp = dmatrix(model1.data.design_info.builder, df)

        pr1 = result1.predict()
        pr2 = result1.predict(exog=df)
        pr3 = model1.predict(result1.params, exog=dfp) # No standard errors
        pr4 = model1.predict(result1.params, cov_params=result1.cov_params(), exog=dfp)

        prl = (pr1, pr2, pr3, pr4)
        for i in range(4):
            for j in range(i):
                assert_allclose(prl[i].predicted_values, prl[j].predicted_values)

        prl = (pr1, pr2, pr4)
        for i in range(3):
            for j in range(i):
                assert_allclose(prl[i].standard_errors, prl[j].standard_errors)
Example 14
def xtab(formula, covariate_df):
    y, X = patsy.dmatrices(str(formula), covariate_df)
    X = patsy.dmatrix('genotype', covariate_df)
    ix = get_genotype_ix(X)

    tbl = pd.crosstab(X[:, ix], y.ravel())
    try:
        tbl.columns = ['%s_%i' % (y.design_info.column_names[-1], j) for j in range(2)]
    except Exception:
        return None  # too few samples
    tbl.index = ['%i_alts' % i for i in tbl.index]
    alts = set(tbl.index)
    if len(alts) < 2 or '0_alts' not in alts:
        tbl_dom = None
    else:
        tbl_dom = pd.DataFrame({'0_alts': tbl.loc['0_alts', :],
                                'n_alts': tbl.loc[list(alts - set(['0_alts'])), :].sum()}).T

    # can't test recessive without any homozygous alts.
    if '2_alts' not in alts or len(alts) < 2:
        tbl_rec = None
    else:
        tbl_rec = pd.DataFrame({'lt2_alts': tbl.loc[['0_alts', '1_alts'], :].sum(),
                                '2_alts': tbl.loc['2_alts', :]})

    d = {}
    for name, xtbl in (('additive', tbl), ('dominant', tbl_dom), ('recessive', tbl_rec)):
        if xtbl is None:

            d['p.chi.%s' % name] =  'nan'
            continue

        chi, p, ddof, e = chi2_contingency(xtbl)
        if name == 'additive':
            d = xtbl.to_dict()
        d['p.chi.%s' % name] = "%.3g" % p
    return d
Example 15
def test_mnl_estimation(obs, alts):
    """
    Confirm that estimated params from the new interface match urbansim.urbanchoice.
    Only runs if the urbansim package has been installed.
    
    """
    try:
        from urbansim.urbanchoice.mnl import mnl_estimate
    except ImportError:
        print("Comparison of MNL estimation results skipped because urbansim is not installed")
        return

    model_expression = 'obsval + altval - 1'
    mct = MergedChoiceTable(obs, alts, 'choice')
    
    # new interface
    m = MultinomialLogit(mct, model_expression)
    r = m.fit().get_raw_results()
    
    # old interface
    dm = dmatrix(model_expression, mct.to_frame())
    chosen = np.reshape(mct.to_frame()[mct.choice_col].values, (100, 5))
    log_lik, fit = mnl_estimate(np.array(dm), chosen, numalts=5)
    
    for k,v in log_lik.items():
        assert(v == pytest.approx(r['log_likelihood'][k], 0.00001))
    
    assert_frame_equal(fit, r['fit_parameters'][['Coefficient', 'Std. Error', 'T-Score']])
Example 16
def main():
    train_df_filled=fill_null_vals(train_df,'Fare')
    train_df_filled=fill_null_vals(train_df_filled,'Age')
    assert len(train_df_filled)==len(train_df)
    
    test_df_filled=fill_null_vals(test_df,'Fare')
    test_df_filled=fill_null_vals(test_df_filled,'Age')
    assert len(test_df_filled)==len(test_df)

    

    for formula_name, formula in formula_map.items():

        print("name=%s formula=%s" % (formula_name, formula))

        y_train, X_train = dmatrices('Survived ~ ' + formula,
                                     train_df_filled, return_type='dataframe')
        print("Running DecisionTreeClassifier with formula : %s" % formula)
        print("X_train cols=%s " % X_train.columns)
        y_train = np.ravel(y_train)
        model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
        print("About to fit...")
        dt_model = model.fit(X_train, y_train)
        print("Training score:%s" % dt_model.score(X_train, y_train))
        X_test = dmatrix(formula, test_df_filled)
        predicted = dt_model.predict(X_test)
        print("predicted:%s" % predicted[:5])
        assert len(predicted) == len(test_df)
        pred_results = pd.Series(predicted, name='Survived')
        dt_results = pd.concat([test_df['PassengerId'], pred_results], axis=1)
        dt_results.Survived = dt_results.Survived.astype(int)
        results_file = 'csv/dt_%s.csv' % formula_name
        print("output file: %s\n" % results_file)
        #results_file = re.sub('[+ ()C]','',results_file)
        dt_results.to_csv(results_file, index=False)
Example 17
def _temp_plot(dates, Y_seg_mean, Y_seg_std, Y_seg_stderr, Y_seg_mask,
               seg_id, plot_idx, results=None):
    import matplotlib.pyplot as plt

    seg_id -= 1
    plot_idx = 5
    plt.subplot(3, 1, 1)
    plt.plot(dates[Y_seg_mask[seg_id, :]],
             Y_seg_mean[seg_id, plot_idx, Y_seg_mask[seg_id, :]], 'ro')
    plt.ylabel('Mean idx {i}'.format(i=plot_idx))

    plt.subplot(3, 1, 2)
    plt.plot(dates[Y_seg_mask[seg_id, :]],
             Y_seg_std[seg_id, plot_idx, Y_seg_mask[seg_id, :]], 'ro')
    plt.ylabel('Std idx {i}'.format(i=plot_idx))

    plt.subplot(3, 1, 3)
    plt.errorbar(dates[Y_seg_mask[seg_id, :]],
                 Y_seg_mean[seg_id, plot_idx, Y_seg_mask[seg_id, :]],
                 yerr=Y_seg_stderr[seg_id, plot_idx, Y_seg_mask[seg_id, :]],
                 fmt='o')
    plt.ylabel('Mean/stderr idx {i}'.format(i=plot_idx))

    if results is not None:
        for i, r in enumerate(results.record):
            mx = np.arange(r['start'], r['end'], 1)
            mX = patsy.dmatrix(results.design_info, {'x': mx}).T
            my = np.dot(r['coef'][:, plot_idx], mX)
            # predictions 'my' are computed but not yet plotted here
    plt.show()
Example 18
def fitMinSpline(Yvar, Xvar, smoothingWindow, plot=False, plotVar=None):
    '''
    Function returns minimal interpolation spline
    Inputs:
    Yvar : dependent variables that needed to be fit
    Xvar : independent variables that needed to be fit
    smoothingWindow : the smoothing time average
    plot = boolean value to plot or not, default is not to plot
    plotVar = plot a specific variable, default none
    '''
    X = np.asarray(patsy.dmatrix("cr(x, df=7)-1", {"x": Xvar}))
    modDat = pd.DataFrame(X, index=Yvar.index)
    modDat.columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7']
    half = smoothingWindow // 2
    modDatTrunc = modDat.iloc[half:-half].copy()
    window = np.ones(smoothingWindow) / float(smoothingWindow)
    modDatTrunc['Y'] = np.convolve(Yvar, window, 'same')[half:-half]
    mod = smf.quantreg('Y~X1+X2+X3+X4+X5', modDatTrunc)
    res = mod.fit(q=0.01)
    preds = pd.Series(res.predict(modDat), index = Xvar.index)
    if plot:
        plotDF = pd.concat([plotVar, Yvar, preds], axis=1)
        print(plotDF.columns)
        plotDF.columns = [plotVar.name, Yvar.name, 'fitted']
        p = ggplot(aes(x=plotVar.name, y=Yvar.name), data=plotDF) + geom_line() +\
            geom_line(aes(y='fitted'), color='red')+\
            ylim(0,5) +\
            xlab('') + ylab('Sensor (V)')
        print(p)
    return preds
Example 19
    def setup_class(cls):

        sp = np.array([40491.3940640059, 232455.530262537])
        # s_scale is same as before
        cls.s_scale = s_scale = np.array([2.443955e-06, 0.007945455])

        cls.exog = patsy.dmatrix('fuel + drive', data=df_autos)

        x_spline = df_autos[['weight', 'hp']].values
        bs = BSplines(x_spline, df=[12, 10], degree=[3, 3],
                      variable_names=['weight', 'hp'],
                      constraints='center',
                      include_intercept=True)

        alpha0 = 1 / s_scale * sp / 2
        gam_bs = GLMGam.from_formula('city_mpg ~ fuel + drive', df_autos,
                                     smoother=bs, family=family.Poisson(),
                                     alpha=alpha0)

        cls.res1a = gam_bs.fit(use_t=False)

        cls.res1b = gam_bs.fit(method='newton', use_t=True)
        cls.res1 = cls.res1a._results
        cls.res2 = results_mpg_bs_poisson.mpg_bs_poisson

        cls.rtol_fitted = 1e-8
        cls.covp_corrfact = 1  # not needed
Example 20
def hsalinarum_replicate_data():
	import patsy

	data = pd.read_excel("data/hsalinarum/Raw_growth_data2.xlsx", sheet_name='Raw data (OD600)SLIM')
	# time = np.arange(4,48,.5)
	time = np.arange(4,48,4)

	# temp = data[(data.Condition.isnull()) & ((data.Strain == 'ura3') | (data.Strain=='trmB') | (data.Strain == 'rosR'))]
	# temp = data[(data.Condition.isnull()) & ((data.Strain=='trmB') | (data.Strain == 'rosR'))]
	# temp = data[(data.Condition.isnull()) & ((data.Strain=='trmB') | (data.Strain == 'rosR') | (data.Strain == 'trh2'))]
	temp = data[(data.Condition.isnull()) & ((data.Strain == 'ura3'))]
	# temp = data[(data.Condition.isnull()) & (data.Strain!='ura3')]

	temp.loc[temp.Condition.isnull(), 'Condition'] = ''

	y = temp[time].T.values
	y = np.log2(y)
	y = y - y[0,:]

	x = time
	x = (x-x.mean())/x.std()
	x = x[:,None]

	effect = patsy.dmatrix('C(Experiment):C(Well)+0',temp)
	effect = np.where(effect!=0)[1][:,None]

	return x,y,effect
Example 21
def test_mnl_prediction(obs, alts):
    """
    Confirm that fitted probabilities in the new codebase match urbansim.urbanchoice.
    Only runs if the urbansim package has been installed.
    
    """
    try:
        from urbansim.urbanchoice.mnl import mnl_simulate
    except ImportError:
        print("Comparison of MNL simulation results skipped because urbansim is not installed")
        return

    # produce a fitted model
    mct = MergedChoiceTable(obs, alts, 'choice', 5)
    m = MultinomialLogit(mct, model_expression='obsval + altval - 1')
    results = m.fit()
    
    # get predicted probabilities using choicemodels
    probs1 = results.probabilities(mct)
    
    # compare to probabilities from urbansim.urbanchoice
    dm = dmatrix(results.model_expression, data=mct.to_frame(), return_type='dataframe')

    probs = mnl_simulate(data=dm, coeff=results.fitted_parameters,
                         numalts=mct.sample_size, returnprobs=True)

    df = mct.to_frame()
    df['prob'] = probs.flatten()
    probs2 = df.prob
    
    pd.testing.assert_series_equal(probs1, probs2)
Example 22
    def setup_class(cls):

        sp = np.array([0.830689464223685, 425.361212061649])
        cls.s_scale = s_scale = np.array([2.443955e-06, 0.007945455])

        x_spline = df_autos[['weight', 'hp']].values
        # We need asarray to remove the design_info
        # If design_info is attached,
        #     then exog_linear will also be transformed in predict.
        cls.exog = np.asarray(patsy.dmatrix('fuel + drive', data=df_autos))
        bs = BSplines(x_spline, df=[12, 10], degree=[3, 3],
                      variable_names=['weight', 'hp'],
                      constraints='center',
                      include_intercept=True)
        # TODO alpha needs to be list
        alpha0 = 1 / s_scale * sp / 2
        gam_bs = GLMGam(df_autos['city_mpg'], exog=cls.exog, smoother=bs,
                        alpha=(alpha0).tolist())
        cls.res1a = gam_bs.fit(use_t=True)

        cls.res1b = gam_bs.fit(method='newton', use_t=True)
        cls.res1 = cls.res1a._results
        cls.res2 = results_mpg_bs.mpg_bs

        cls.rtol_fitted = 1e-8
        cls.covp_corrfact = 1  # not needed

        # for checking that alpha model attribute is unchanged, same as alpha0
        cls.alpha = [169947.78222669504, 26767.58046340008]
Example 23
def main():
    train_df_filled=fill_null_vals(train_df,'Fare')
    train_df_filled=fill_null_vals(train_df_filled,'Age')
    assert len(train_df_filled)==len(train_df)
    
    test_df_filled=fill_null_vals(test_df,'Fare')
    test_df_filled=fill_null_vals(test_df_filled,'Age')
    assert len(test_df_filled)==len(test_df)

    
    num_estimators=10000
    for formula_name, formula in formula_map.items():

        print("name=%s formula=%s" % (formula_name, formula))

        y_train, X_train = dmatrices('Survived ~ ' + formula,
                                     train_df_filled, return_type='dataframe')
        print("Running RandomForestClassifier with formula : %s" % formula)
        print("X_train cols=%s " % X_train.columns)
        y_train = np.ravel(y_train)
        model = RandomForestClassifier(n_estimators=num_estimators, random_state=0)
        print("About to fit...")
        rf_model = model.fit(X_train, y_train)
        print("Training score:%s" % rf_model.score(X_train, y_train))
        X_test = dmatrix(formula, test_df_filled)
        predicted = rf_model.predict(X_test)
        print("predicted:%s" % predicted[:5])
        assert len(predicted) == len(test_df)
        pred_results = pd.Series(predicted, name='Survived')
        rf_results = pd.concat([test_df['PassengerId'], pred_results], axis=1)
        rf_results.Survived = rf_results.Survived.astype(int)
        results_file = 'csv/rf_%s_n_est_%s.csv' % (formula_name, num_estimators)
        print("output file: %s\n" % results_file)
        #results_file = re.sub('[+ ()C]','',results_file)
        rf_results.to_csv(results_file, index=False)
Example 24
def hsalinarum_data():
	import patsy

	data = pd.read_excel("data/hsalinarum/Raw_growth_data2.xlsx", sheet_name='Raw data (OD600)SLIM')
	# time = np.arange(4,48,.5)
	time = np.arange(4,48,2)

	# temp = data[(data.Condition.isnull()) & ((data.Strain == 'ura3') | (data.Strain=='trmB') | (data.Strain == 'rosR'))]
	# temp = data[(data.Condition.isnull()) & ((data.Strain=='trmB') | (data.Strain == 'rosR'))]
	# temp = data[(data.Condition.isnull()) & ((data.Strain=='trmB') | (data.Strain == 'rosR') | (data.Strain == 'trh2'))]
	temp = data[(data.Condition.isnull()) & ((data.Strain=='trmB') | (data.Strain == 'rosR') | (data.Strain == 'trh2') | (data.Strain == 'idr1'))]
	# temp = data[(data.Condition.isnull()) & (data.Strain!='ura3')]

	y = temp[time].T.values
	y = np.log2(y)
	y = y - y[0,:]

	x = time
	x = (x-x.mean())/x.std()
	x = x[:,None]

	effect = patsy.dmatrix('C(Strain)+0',temp)
	effect = np.where(effect!=0)[1][:,None]

	# effect = (temp.Strain != "ura3").astype(int).values[:,None]

	return x,y,effect
Example 25
def test_harmonic_transform():
    x = np.arange(735688, 735688 + 100, 1)
    design = patsy.dmatrix("0 + harm(x, 1)")

    truth = np.vstack((np.cos(2 * np.pi / 365.25 * x), np.sin(2 * np.pi / 365.25 * x))).T

    np.testing.assert_equal(np.asarray(design), truth)
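The formula above assumes a harm transform is visible in the evaluation namespace (patsy resolves names from the caller's scope). A sketch consistent with the asserted output, i.e. the n-th harmonic of ordinal dates at an annual period, might be:

import numpy as np

def harm(x, n):
    # hypothetical harmonic transform: cos/sin pair for the n-th annual harmonic
    w = 2 * np.pi * n / 365.25
    x = np.asarray(x)
    return np.column_stack((np.cos(w * x), np.sin(w * x)))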
Example 26
    def create_node(self, name, kwargs, data):
        reg = kwargs['regressor']
        # order parents according to user-supplied args
        args = []
        for arg in reg['params']:
            for parent_name, parent in kwargs['parents'].items():
                if parent_name == arg:
                    args.append(parent)

        parents = {'args': args}

        # Make sure design matrix is kosher
        dm = dmatrix(reg['model'], data=data)
        if math.isnan(dm.sum()):
            raise NotImplementedError('DesignMatrix contains NaNs.')

        def func(args, design_matrix=dmatrix(reg['model'], data=data), link_func=reg['link_func']):
            # convert parents to matrix
            params = np.matrix(args)
            # Apply design matrix to input data
            if design_matrix.shape[1] != params.shape[1]:
                raise NotImplementedError('Missing columns in design matrix. You need data for all conditions for all subjects.')
            predictor = link_func(pd.DataFrame((design_matrix * params).sum(axis=1), index=data.index))

            return pd.DataFrame(predictor, index=data.index)

        return self.pymc_node(func, kwargs['doc'], name, parents=parents, trace=self.keep_regressor_trace)
Example 27
    def from_formula(cls,
                     formula,
                     vc_formulas,
                     data,
                     family=None,
                     vcp_p=1,
                     fe_p=2):
        """
        Fit a BayesMixedGLM using a formula.

        Parameters
        ----------
        formula : string
            Formula for the endog and fixed effects terms (use ~ to separate
            dependent and independent expressions).
        vc_formulas : dictionary
            vc_formulas[name] is a one-sided formula that creates one
            collection of random effects with a common variance
            parameter.  If using a categorical expression to produce
            variance components, note that generally `0 + ...` should
            be used so that an intercept is not included.
        data : data frame
            The data to which the formulas are applied.
        family : genmod.families instance
            A GLM family.
        vcp_p : float
            The prior standard deviation for the logarithms of the standard
            deviations of the random effects.
        fe_p : float
            The prior standard deviation for the fixed effects parameters.
        """

        ident = []
        exog_vc = []
        vcp_names = []
        j = 0
        for na, fml in vc_formulas.items():
            mat = patsy.dmatrix(fml, data, return_type='dataframe')
            exog_vc.append(mat)
            vcp_names.append(na)
            ident.append(j * np.ones(mat.shape[1]))
            j += 1
        exog_vc = pd.concat(exog_vc, axis=1)
        vc_names = exog_vc.columns.tolist()

        ident = np.concatenate(ident)

        model = super(_BayesMixedGLM, cls).from_formula(
            formula,
            data=data,
            family=family,
            subset=None,
            exog_vc=exog_vc,
            ident=ident,
            vc_names=vc_names,
            vcp_names=vcp_names,
            fe_p=fe_p,
            vcp_p=vcp_p)

        return model
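A hypothetical call through the public Binomial subclass (simulated data; names are illustrative only):

import numpy as np
import pandas as pd
from statsmodels.genmod.bayes_mixed_glm import BinomialBayesMixedGLM

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "y": rng.integers(0, 2, size=200),
    "x1": rng.normal(size=200),
    "site": rng.integers(0, 10, size=200).astype(str),
})
model = BinomialBayesMixedGLM.from_formula(
    "y ~ x1", vc_formulas={"site": "0 + C(site)"}, data=df)
result = model.fit_vb()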
Example 28
def logisticpatsy():
    df = pd.read_csv("train.csv")
    cleanpatsy(df)
    y, X = dmatrices('Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Cabin + Embarked', df, return_type="dataframe")
    y = np.ravel(y)

    model = LogisticRegression()
    model = model.fit(X, y)

    # check the accuracy on the training set
    print(model.score(X, y))

    # evaluate the model by splitting into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    model2 = LogisticRegression()
    model2.fit(X_train, y_train)
    predicted = model2.predict(X_test)
    print(metrics.accuracy_score(y_test, predicted))


    dftest = pd.read_csv("test.csv")
    cleanpatsy(dftest)
    X = dmatrix('Pclass + Sex + Age + SibSp + Parch + Fare + Cabin + Embarked',dftest, return_type="dataframe")
    predict_survive = model.predict(X)
    result = {'PassengerId':dftest.PassengerId, 'Survived':predict_survive}
    dfresult = pd.DataFrame(result)
    dfresult.to_csv("result.csv",index=False)
    print(pd.DataFrame(list(zip(X.columns, np.transpose(model.coef_)))))
Example 29
def runPyCombat(fl):
    """ This method was added specifically for AltAnalyze version 2.0.8 (not in the original GitHub code) """
    print "Running Combat...",
    expr_input_dir = fl.ExpFile()
    pheno_dir = formatPhenoFile(fl)

    moved_exp_dir = export.findParentDir(expr_input_dir) + "Non-Combat/" + export.findFilename(expr_input_dir)
    try:
        export.copyFile(expr_input_dir, moved_exp_dir)
        print("Moved original expression file to:")
        print("\t" + moved_exp_dir)
        ### now overwrite the original, excluding the commented rows
        export.cleanFile(expr_input_dir, removeExtra="#")  ### remove comments from the original file
    except Exception:
        pass

    pheno = pa.read_table(pheno_dir, index_col=0)
    dat = pa.read_table(expr_input_dir, index_col=0)

    mod = patsy.dmatrix("group", pheno, return_type="dataframe")
    t = time.time()
    # print dat, pheno.batch, mod; sys.exit()
    ebat = combat(dat, pheno.batch, mod, 0)
    print("...Combat completed in %.2f seconds" % (time.time() - t))

    print("Original expression file over-written with batch effect removal results...")
    ebat.to_csv(expr_input_dir, sep="\t")
Example 30
def prediction_given_R(all_samples, formula_variables, entries, prediction, outcomes):

    from patsy import dmatrix
    from numpy import arange, array
    entries_new_R = entries.copy()
    formula = " + ".join(formula_variables)
    Rs_to_test = arange(0,1.1,.1)
    
    prediction_given_R = pd.Panel(items=outcomes,
                                  major_axis=arange(all_samples.shape[1]), 
                                  minor_axis=Rs_to_test)

    for R in Rs_to_test:
        print(R)
        entries_new_R['Relatedness'] = R
        predictors = array(dmatrix(formula, entries_new_R))

        for outcome in outcomes:   
            print(outcome)
            betas = all_samples[outcome][['Intercept']+formula_variables].values
            prediction_given_R.loc[outcome, :, R] = prediction(predictors, betas.T).mean(axis=0)


    quantiles_for_prediction = [.025, .5, .975]
    prediction_quantiles_given_R = pd.Panel(items=prediction_given_R.items,
                                            major_axis = quantiles_for_prediction,
                                            minor_axis = prediction_given_R.minor_axis
                                            )
    for item in prediction_quantiles_given_R.items:
        prediction_quantiles_given_R[item] = prediction_given_R[item].quantile(quantiles_for_prediction)

    return prediction_quantiles_given_R
Example 31
    def generate_sample_description(
            self,
            num_conditions=2,
            num_batches=4,
            intercept_scale: bool = False,
            **kwargs
    ):
        self.sim_design_loc, self.sample_description = generate_sample_description(
            self.nobs,
            num_conditions=num_conditions,
            num_batches=num_batches,
            **kwargs
        )
        if intercept_scale:
            self.sim_design_scale = patsy.dmatrix("~1", self.sample_description)
        else:
            self.sim_design_scale = self.sim_design_loc
Example 32
        def func(
            args,
            design_matrix=dmatrix(reg["model"], data=data),
            link_func=reg["link_func"],
        ):
            # convert parents to matrix
            params = np.matrix(args)
            # Apply design matrix to input data
            if design_matrix.shape[1] != params.shape[1]:
                raise NotImplementedError(
                    "Missing columns in design matrix. You need data for all conditions for all subjects."
                )
            predictor = link_func(
                pd.DataFrame((design_matrix * params).sum(axis=1), index=data.index)
            )

            return pd.DataFrame(predictor, index=data.index)
Example 33
    def __init__(self, sample, formula=None, design=None):
        assert type(sample) is Sample, 'sample must be of type Sample'

        self.sample     = sample
        self.covariates = sample.covariates
        self.statistics = sample.statistics

        if (formula is not None) and (design is None):
            dmat = dmatrix(formula, self.covariates, eval_env=-1)
            parameter_names = dmat.design_info.column_names
            design = np.asarray(dmat)
        else:
            parameter_names = ['Intercept']

        self.formula = formula
        self.parameter_names = parameter_names
        self.design = design
Example 34
def _transform_predict_exog(model, exog, design_info=None):
    """transform exog for predict using design_info

    Note: this is copied from base.model.Results.predict and converted to
    standalone function with additional options.
    """

    is_pandas = _is_using_pandas(exog, None)

    exog_index = exog.index if is_pandas else None

    if design_info is None:
        design_info = getattr(model.data, 'design_info', None)

    if design_info is not None and (exog is not None):
        from patsy import dmatrix
        if isinstance(exog, pd.Series):
            # we are guessing whether it should be column or row
            if (hasattr(exog, 'name') and isinstance(exog.name, str)
                    and exog.name in design_info.describe()):
                # assume we need one column
                exog = pd.DataFrame(exog)
            else:
                # assume we need a row
                exog = pd.DataFrame(exog).T
        orig_exog_len = len(exog)
        is_dict = isinstance(exog, dict)
        exog = dmatrix(design_info, exog, return_type="dataframe")
        if orig_exog_len > len(exog) and not is_dict:
            import warnings
            if exog_index is None:
                warnings.warn('nan values have been dropped', ValueWarning)
            else:
                exog = exog.reindex(exog_index)
        exog_index = exog.index

    if exog is not None:
        exog = np.asarray(exog)
        if exog.ndim == 1 and (model.exog.ndim == 1
                               or model.exog.shape[1] == 1):
            exog = exog[:, None]
        exog = np.atleast_2d(exog)  # needed in count model shape[1]

    return exog, exog_index
Example 35
def match_and_filter(table, metadata, formula,
                     min_sample_count, min_feature_count):
    """ Matches and aligns biom and metadata tables.

    This will also return the patsy representation.

    Parameters
    ----------
    table : biom.Table
        Table of abundances
    metadata : pd.DataFrame
        Sample metadata
    formula : str
        Statistical formula specifying the design matrix
    min_sample_count : int
        Minimum total count for a sample to be retained
    min_feature_count : int
        Minimum number of samples in which a feature must be observed
        to be retained

    Returns
    -------
    table : biom.Table
        Filtered biom table
    metadata : pd.DataFrame
        Sample metadata
    design : pd.DataFrame
        Patsy design matrix
    """
    # match them

    def sample_filter(val, id_, md):
        return id_ in metadata.index and np.sum(val) > min_sample_count

    def read_filter(val, id_, md):
        return np.sum(val > 0) > min_feature_count

    table = table.filter(sample_filter, axis='sample', inplace=False)
    table = table.filter(read_filter, axis='observation', inplace=False)

    metadata = metadata.loc[table.ids(axis='sample')]
    metadata = metadata.loc[~metadata.index.duplicated(keep='first')]

    def sort_f(xs):
        return [xs[metadata.index.get_loc(x)] for x in xs]

    table = table.sort(sort_f=sort_f, axis='sample')
    design = dmatrix(formula, metadata, return_type='dataframe')
    design = design.dropna()

    def design_filter(val, id_, md):
        return id_ in design.index

    table = table.filter(design_filter, axis='sample')
    return table, metadata, design
Example 36
def load_census_data(fname_census, fname_census_tst):
    ''' load UCI Adult Census dataset
    keep it as training + test to match the available classification accuracy results
    '''

    census = pd.read_table(fname_census, sep=',', header=None,
                           names=['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
                                  'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
                                  'hours_per_week', 'native_country', 'label'])
    census_tst = pd.read_table(fname_census_tst, sep=',', header=None, names=census.columns)

    ### removing NaNs
    print("Removing rows with missing data")
    census = census.dropna()
    census_tst = census_tst.dropna()
    #census_tst.index = census_tst.index + len(census)  ### change the index to enable concatenation

    inds_tr = np.arange(len(census))
    inds_tst = np.arange(len(inds_tr), len(inds_tr) + len(census_tst))
    
    census = pd.concat([census, census_tst], ignore_index = True)

    ### find out what kind of features we're dealing with
    col_names = [x.replace(' ', '_') for x in census.columns]

    print("WARNING: patsy is dropping the _reference_ label, need to disable this.. ")
    patsy_formula = '+'.join(col_names) + '-1'  ### -1 to remove intercept

    X = patsy.dmatrix(patsy_formula, census, return_type = 'dataframe')

    #del X['Intercept']  ### is there a way to do it in dmatrix directly
    #new_X_col_names = [x.replace(']','').replace('[T. ', ':') for x in X.columns]
    new_X_col_names = [x.replace(']','').replace('[T.', ':').replace('[ ', ':').replace(' ', '') for x in X.columns]
    X.columns = new_X_col_names

    label_col = 'label:>50K'
    ind_label = np.where(X.columns == label_col)[0][0]
    cols_reorder = X.columns.tolist()
    cols_reorder[ind_label] = cols_reorder[0]
    cols_reorder[0] = label_col
    X = X[cols_reorder]

    X_tr = X.iloc[inds_tr, :]
    X_tst = X.iloc[inds_tst, :]

    return census, X_tr, X_tst
Example 37
    def fit(self):
        """
        Fit the model using maximum likelihood estimation. Uses either the ChoiceModels
        or PyLogit estimation engine as appropriate.

        Returns
        -------
        MultinomialLogitResults() object.

        """
        if (self._estimation_engine == 'PyLogit'):

            m = pylogit.create_choice_model(
                data=self._df,
                obs_id_col=self._observation_id_col,
                alt_id_col=self._alternative_id_col,
                choice_col=self._choice_col,
                specification=self._model_expression,
                names=self._model_labels,
                model_type='MNL')

            m.fit_mle(init_vals=self._initial_coefs)
            results = MultinomialLogitResults(
                estimation_engine=self._estimation_engine,
                model_expression=self._model_expression,
                results=m)

        elif (self._estimation_engine == 'ChoiceModels'):

            dm = dmatrix(self._model_expression, data=self._df)

            chosen = np.reshape(self._df[[self._choice_col]].values,
                                (self._numobs, self._numalts))

            log_lik, fit = mnl_estimate(np.array(dm), chosen, self._numalts)

            result_params = dict(log_likelihood=log_lik,
                                 fit_parameters=fit,
                                 x_names=dm.design_info.column_names)

            results = MultinomialLogitResults(
                estimation_engine=self._estimation_engine,
                model_expression=self._model_expression,
                results=result_params)

        return results
Example 38
    def predict(self, data):
        """
        Predict new values by running data through the fit model.

        Parameters
        ----------
        data : pandas.DataFrame
            Table with columns corresponding to the RHS of `model_expression`.

        Returns
        -------
        predicted : ndarray
            Array of predicted values.

        """
        model_design = dmatrix(self._rhs, data=data, return_type='dataframe')
        return model_design.dot(self.params).values
Example 39
def glm_wrap_continuous(conn,
                        pheno,
                        contrast,
                        regressors,
                        report=False,
                        fast=False):
    # Make sure pheno and conn have the same number of cases
    if not conn.shape[0] == pheno.shape[0]:
        print(
            f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must be same number of cases'
        )

    # Define the subset of the sample
    sub_mask = find_subset(pheno, contrast)
    sub_conn = conn[sub_mask, :]
    sub_pheno = pheno.loc[sub_mask]
    n_sub = sub_pheno.shape[0]
    n_data = sub_conn.shape[1]
    sub_conn_stand = standardize(sub_conn, np.ones(n_sub).astype(bool))

    if report:
        print(
            f'Selected sample based on contrast variable {contrast}.\n'
            f'Found {n_sub} subjects with no missing data for {contrast}\n'
            f'original sample: n={pheno.shape[0]}; new sample: n={n_sub}\n'
            f'{n_data} data points available\n'
            f'standardized estimators are based on all subjects with no missing data for {contrast}'
        )

    formula = ' + '.join((regressors, contrast))
    design_matrix = pat.dmatrix(formula, sub_pheno, return_type='dataframe')

    if fast:
        betas = fast_glm(sub_conn, design_matrix, contrast)
        table = pd.DataFrame(data={'betas': betas})
    else:
        betas, pvals = glm(sub_conn, design_matrix, contrast)
        stand_betas, _ = glm(sub_conn_stand, design_matrix, contrast)
        table = pd.DataFrame(data={
            'betas': betas,
            'stand_betas': stand_betas,
            'pvals': pvals
        })

    return table
Example 40
    def exposure_model(self, model, custom_model=None, bound=False, print_results=True):
        """Estimation of Pr(A=1|L), which is termed as g(A=1|L) in the literature

        Parameters
        ----------
        model : str
            Independent variables to predict the exposure. Example) 'var1 + var2 + var3'
        custom_model : optional
            Input for a custom model that is used in place of the logit model (default). The model must have the
            "fit()" and  "predict()" attributes. Both sklearn and supylearner are supported as custom models. In the
            background, TMLE will fit the custom model and generate the predicted probablities
        bound : float, list, optional
            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
            Specifying this argument can improve finite sample performance for random positivity violations. However,
            truncating weights leads to additional confounding. Default is False, meaning no truncation of
            predicted probabilities occurs. Providing a single float assumes symmetric trunctation, where values below
            or above the threshold are set to the threshold value. Alternatively a list of floats can be provided for
            asymmetric trunctation, with the first value being the lower bound and the second being the upper bound
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)
        """
        self._exp_model = self.exposure + ' ~ ' + model
        self.__mweight = model

        # Step 3) Estimation of g-model (exposure model)
        if custom_model is None:
            fitmodel = propensity_score(self.df, self._exp_model, print_results=print_results)
            self.g1W = fitmodel.predict(self.df)

        # User-specified prediction model
        else:
            warnings.warn("TMLE can result in confidence intervals below nominal coverage when used with "
                          "machine learning algorithms. TMLE will no longer support custom machine learning "
                          "models in v0.9.0")
            self._exp_model_custom = True
            data = patsy.dmatrix(model + ' - 1', self.df)
            self.g1W = exposure_machine_learner(xdata=np.asarray(data), ydata=np.asarray(self.df[self.exposure]),
                                                ml_model=custom_model, print_results=print_results)

        self.g0W = 1 - self.g1W
        if bound:  # Bounding predicted probabilities if requested
            self.g1W = _bounding_(self.g1W, bounds=bound)
            self.g0W = _bounding_(self.g0W, bounds=bound)

        self._fit_exposure_model = True
Example 41
    def _fit_transform(self, data, y=None):
        eval_env = EvalEnvironment.capture(self.eval_env, reference=2)
        formula = _drop_intercept(self.formula, self.add_intercept)

        design = dmatrix(formula,
                         data,
                         eval_env=eval_env,
                         NA_action=self.NA_action,
                         return_type='dataframe')
        self.design_ = design.design_info
        self.feature_names_ = design.design_info.column_names

        if self.return_type == 'dataframe':
            return design
        else:
            return np.array(design)
Example 42
def wrap_subtype_stability(arg):
    data_stack = arg['data_stack']
    sbt_idx = arg['sbt_idx']
    dist_thr = arg['dist_thr']
    part_thr = arg['part_thr']
    regressors = arg['regressors']
    pheno = arg['pheno']
    # Regress nuisance for these individuals first
    design_matrix = pat.dmatrix(regressors, data=pheno.iloc[sbt_idx])
    residuals = asdfc.stats.nuisance_correction(data_stack[sbt_idx, ...],
                                                design_matrix,
                                                n_jobs=-1)
    # Then extract the subtype
    part, _, _ = asdfc.stats.subtype_partition(residuals,
                                               mode='core',
                                               dist_thr=dist_thr,
                                               part_thr=part_thr)
    return part
Example 43
    def fit(self, X, y):
        # Build the design matrix via a tensor basis expansion of natural spline bases
        data = {'x{}'.format(i + 1): x for i, x in enumerate(X.T)}
        design_matrix = dmatrix(
            "te(" + ",".join([
                'cr(x{}, df={})'.format(i + 1, self.df)
                for i in range(X.shape[1])
            ]) + ", constraints='center')", data)

        # Save the design information for future predictions
        self.design_info = design_matrix.design_info

        # Fit the model using the basis
        mod = smf.quantreg('y ~ x - 1', {'y': y, 'x': design_matrix})
        if np.isscalar(self.quantiles):
            self.model = mod.fit(q=self.quantiles)
        else:
            self.model = [mod.fit(q=q) for q in self.quantiles]
Example 44
    def __init__(self, t, y, _F, _gradF, dof=5, deg=3, ndiff=1):
        assert t.size == y.size, 't and y must have the same size'

        self.t = t
        # self.y = y
        self.y = (y - y.min())/(y.max() - y.min())
        self.m = t.size
        self.k = dof - ndiff

        self._F = _F
        self._gradF = _gradF

        # self.theta0 = theta0

        str_input = f"bs(x, df={dof}, degree={deg}, include_intercept=True) - 1"
        self.A = dmatrix(str_input, {"x": t})
        # self.D = L(dof, ndiff)
        self.D = L(self.m, ndiff) @ self.A
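The L helper above is not shown; a common construction (an assumption here) is the ndiff-th order finite-difference operator used for P-spline penalties:

import numpy as np

def L(m, ndiff):
    # hypothetical helper: ndiff-th order difference matrix, shape (m - ndiff, m)
    return np.diff(np.eye(m), n=ndiff, axis=0)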
Example 45
def model_experimental(data, formula, baseline_index=None):

    cell_types = data.var.index.to_list()

    # Get count data
    data_matrix = data.X.astype("float32")

    # Build covariate matrix from R-like formula
    covariate_matrix = pt.dmatrix(formula, data.obs)
    covariate_names = covariate_matrix.design_info.column_names[1:]
    covariate_matrix = covariate_matrix[:, 1:]

    return NoBaselineModelExperimental(
        covariate_matrix=np.array(covariate_matrix),
        data_matrix=data_matrix,
        cell_types=cell_types,
        covariate_names=covariate_names,
        formula=formula)
Example 46
def prep_model(csv_name):
    """
    Loads CSV file of merged pandas DataFrame, cleans, converts categorical variable to a one-hot configuration
    Separates into dependent and independent features for regression
    :param csv_name: Name of CSV file to import, assumed to be in the ../data directory
    :return: x, y pandas DataFrames representing the features and dependent variable to perform regression upon
    """
    df = pd.read_csv('../data/' + csv_name)
    df = df.drop(columns=['Unnamed: 0', 'index', 'Unnamed: 0_y'],
                 errors='ignore')
    housing_categorical = patsy.dmatrix('type',
                                        data=df,
                                        return_type='dataframe')
    df = df.join(housing_categorical)
    df = df.dropna()
    y = df['price']
    x = df.drop(columns=['price', 'type', 'hood', 'title', 'link'])
    return x, y
Example 47
def transform_exog_to_model(fit, exog):
    transform = True
    self = fit

    # The following is lifted straight from statsmodels.base.model.Results.predict()
    if transform and hasattr(self.model, 'formula') and exog is not None:
        from patsy import dmatrix
        exog = dmatrix(self.model.data.orig_exog.design_info.builder, exog)

    if exog is not None:
        exog = np.asarray(exog)
        if exog.ndim == 1 and (self.model.exog.ndim == 1
                               or self.model.exog.shape[1] == 1):
            exog = exog[:, None]
        exog = np.atleast_2d(exog)  # needed in count model shape[1]

    # end lifted code
    return exog
Example 48
def construct_random_effects(groups, data, n_vars):
    re_vars, re_groupings = list(zip(*groups))
    re_vars, re_groupings = set(re_vars), set(re_groupings)
    Zdict = dict(
        zip(re_vars, [
            _check_np(patsy.dmatrix(x, data=data, return_type='dataframe'))
            for x in re_vars
        ]))
    Jdict = dict(zip(re_groupings, [dummy(data[x]) for x in re_groupings]))
    dim_dict = {}
    Z = []
    for x, y in groups:
        Ji, Xi = Jdict[y], Zdict[x]
        dim_dict[y] = {'n_groups': Ji.shape[1], 'n_vars': Xi.shape[1]}
        Zi = khatri_rao(Ji.T, Xi.T).T
        Z.append(Zi)
    Z = np.concatenate(Z, axis=1)
    return Z, dim_dict
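A hedged sketch of the Khatri-Rao step on toy data, assuming scipy.linalg.khatri_rao: each row of the block Zi is the Kronecker product of that observation's group-dummy row and its random-effect covariates, so a covariate is switched on only within its own group.

import numpy as np
import pandas as pd
from scipy.linalg import khatri_rao

grouping = pd.Series(["g1", "g1", "g2", "g2"])
J = pd.get_dummies(grouping).to_numpy(dtype=float)  # (4, 2) group indicators
X = np.column_stack([np.ones(4), np.arange(4.0)])   # random intercept + slope
Zi = khatri_rao(J.T, X.T).T                         # (4, 4); row i is np.kron(J[i], X[i])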
Example n. 49
def survival(row,
             phenotype_df,
             duration_col='T',
             event_col='E',
             other_cols=[]):
    """
    duration_col: survival time
    event_col: whether an event (death or other) has occurred or not. 0 for no, 1 for yes
    other_cols: other variables to consider in the regression
    """
    phenotype_df = phenotype_df.T
    phenotype_df = phenotype_df.join(row.astype(float))
    phenotype_df[duration_col] = phenotype_df[duration_col].astype(float)
    phenotype_df[event_col] = phenotype_df[event_col].astype(int)

    # The following lines deal with char conflicts in patsy formulas
    duration_col = duration_col.replace(' ',
                                        '_').replace('.',
                                                     '_').replace('-', '_')
    event_col = event_col.replace(' ', '_').replace('.', '_').replace('-', '_')
    other_cols = [
        x.replace(' ', '_').replace('.', '_').replace('-', '_')
        for x in other_cols
    ]
    row.name = row.name.replace(' ', '_').replace('.', '_').replace('-', '_')
    phenotype_df.columns = [
        x.replace(' ', '_').replace('.', '_').replace('-', '_')
        for x in phenotype_df.columns
    ]

    formula = row.name + ' + ' + duration_col + ' + ' + event_col
    if other_cols:
        other_cols = [
            x.replace(' ', '_').replace('.', '_') for x in other_cols
        ]
        formula = formula + ' + ' + ' + '.join(other_cols)
    X = patsy.dmatrix(formula_like=formula,
                      data=phenotype_df,
                      return_type='dataframe')
    X = X.drop(['Intercept'], axis=1)
    cph = CoxPHFitter()
    cph.fit(X, duration_col=duration_col, event_col=event_col)
    result = cph.summary.loc[row.name]
    return result
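A toy usage sketch with hypothetical column names; note the phenotype table is passed with samples as columns, since survival() transposes it first.

import pandas as pd

pheno = pd.DataFrame(
    {"T": [5.0, 8.0, 3.0, 9.0, 2.0, 7.0],
     "E": [1, 0, 1, 1, 1, 0],
     "age": [60, 55, 70, 65, 72, 58]},
    index=list("abcdef")).T
expr = pd.Series([0.2, 1.5, 0.7, 2.1, 0.9, 1.1],
                 index=list("abcdef"), name="gene1")
result = survival(expr, pheno, duration_col="T", event_col="E",
                  other_cols=["age"])
print(result["p"])  # Wald p-value for the gene1 hazard coefficient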
Example n. 50
    def set_changepoints(self, changepoints, validate=True):
        # nodes are static unless logic below determines otherwise
        self.nodes_parametric = False

        # trivial case
        if changepoints is None:
            self.changepoint_coefs = None
            return

        # if we have received a list
        if isinstance(changepoints, list):

            # if we have received parametric node placement specifications
            if isinstance(changepoints[0], str):
                self.nodes_parametric = True
                if len(changepoints) > 1:
                    raise ValueError(
                        "Only a single changepoint may be specified currently."
                    )
                if self.data is None:
                    raise ValueError(
                        "Cannot specify changepoints without valid data.")
                if "~" in changepoints[0]:
                    raise ValueError(
                        "Received an invalid changepoint specification.  Changepoints may not specify an outcome variable."
                    )

                # insert dummy intercept (left, terminal node) into list of changepoints
                # we should probably make users do this at some point
                self.changepoint_specifications = ['1'] + changepoints
                # extract dmatrices, hacking the first dummy
                self.changepoint_dmatrices = [
                    pd.DataFrame({'Intercept': np.ones(self.data.shape[0])}),
                    patsy.dmatrix(changepoints[0],
                                  self.data,
                                  return_type='dataframe')
                ]

            else:
                raise ValueError("Changepoints must be patsy strings.")

        if validate:
            # validate
            self.validate_parameters()
Example n. 51
def survival_npcs(row, phenotype_df, duration_col='T', event_col='E', other_cols=[]):
    """
    duration_col: survival time
    event_col: whether an event (death or other) has occurred or not. 0 for no, 1 for yes
    other_cols: other variables to consider in the regression
    """

    row.name = row.name.replace(' ','_').replace('.','_').replace('-','_')
    row_npcs = row
    columns_names = []
    formula = ''
    for n in range(len(row_npcs.iloc[0])):  # positional access; the index may not contain the label 0
        pc_name = row.name + '_pc' + str(n+1)
        columns_names.append(pc_name)
        formula = formula + pc_name + ' + '

    row_npcs = pd.DataFrame(row_npcs.tolist(), index = row_npcs.index)
    row_npcs.columns = columns_names

    # phenotype_df = phenotype_df.join(row.astype(float))
    phenotype_df = phenotype_df.join(row_npcs.astype(float))

    phenotype_df[duration_col] = phenotype_df[duration_col].astype(float)
    phenotype_df[event_col] = phenotype_df[event_col].astype(int)

    # The following lines deal with char conflicts in patsy formulas
    duration_col = duration_col.replace(' ','_').replace('.','_').replace('-','_')
    event_col = event_col.replace(' ','_').replace('.','_').replace('-','_')
    other_cols = [x.replace(' ','_').replace('.','_').replace('-','_') for x in other_cols]
    # row.name = row.name.replace(' ','_').replace('.','_').replace('-','_')
    phenotype_df.columns = [x.replace(' ','_').replace('.','_').replace('-','_') for x in phenotype_df.columns]

    # formula = row.name + ' + ' + duration_col + ' + ' + event_col
    formula = formula + duration_col + ' + ' + event_col
    if other_cols:
        other_cols = [x.replace(' ','_').replace('.','_') for x in other_cols]
        formula = formula + ' + ' + ' + '.join(other_cols)

    X = patsy.dmatrix(formula_like = formula, data = phenotype_df, return_type = 'dataframe')
    X = X.drop(['Intercept'], axis = 1)
    cph = lifelines.CoxPHFitter()
    cph.fit(X, duration_col = duration_col, event_col = event_col)
    result = cph.summary.loc[columns_names]
    return result
Example n. 52
def estimate_sorted_spike_encoding_model(train_position_info,
                                         train_spikes_data, place_bin_centers):
    '''The conditional intensities for each state (Outbound-Forward,
    Outbound-Reverse, Inbound-Forward, Inbound-Reverse)

    Parameters
    ----------
    train_position_info : pandas dataframe
    train_spikes_data : array_like
    place_bin_centers : array_like, shape=(n_parameters,)

    Returns
    -------
    combined_likelihood_kwargs : dict

    '''
    formula = ('1 + trajectory_direction * '
               'bs(linear_distance, df=10, degree=3)')
    design_matrix = dmatrix(formula,
                            train_position_info,
                            return_type='dataframe')
    fit = [
        glm_fit(spikes, design_matrix, ind)
        for ind, spikes in enumerate(train_spikes_data)
    ]

    inbound_predict_design_matrix = _predictors_by_trajectory_direction(
        'Inbound', place_bin_centers, design_matrix)
    outbound_predict_design_matrix = _predictors_by_trajectory_direction(
        'Outbound', place_bin_centers, design_matrix)

    inbound_conditional_intensity = _get_conditional_intensity(
        fit, inbound_predict_design_matrix)
    outbound_conditional_intensity = _get_conditional_intensity(
        fit, outbound_predict_design_matrix)

    conditional_intensity = np.vstack([
        outbound_conditional_intensity, outbound_conditional_intensity,
        inbound_conditional_intensity, inbound_conditional_intensity
    ]).T

    return dict(
        likelihood_function=poisson_likelihood,
        likelihood_kwargs=dict(conditional_intensity=conditional_intensity))
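_predictors_by_trajectory_direction is not shown above; one plausible implementation consistent with its call sites (not necessarily the original) rebuilds the training formula's columns for a grid of positions at a fixed direction:

import numpy as np
import pandas as pd
import patsy

def _predictors_by_trajectory_direction(direction, place_bin_centers,
                                        design_matrix):
    # design_info remembers the spline knots and category levels from fitting
    new_data = pd.DataFrame({
        "linear_distance": place_bin_centers,
        "trajectory_direction": direction})
    (dm,) = patsy.build_design_matrices([design_matrix.design_info], new_data)
    return np.asarray(dm)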
Example n. 53
    def get_formula_cols(formula, df, target_val=False, feature_vals=False):
        if target_val:
            formula = formula.split("~")[0]
        if feature_vals:
            formula = formula.split("~")[1]
        # test just the first 2 datapoints so it runs quicker?
        df = df.sample(2)
        cols = []
        for col in df.columns:

            try:
                if target_val or feature_vals:
                    tmp_mod = patsy.dmatrix(formula, df.drop(col, axis=1))
                else:
                    tmp_mod = patsy.dmatrices(formula, df.drop(col, axis=1))

            except Exception:
                cols.append(col)
        return cols
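A minimal sketch of the failure probe this relies on: when a formula references a name that is neither in the frame nor in the caller's namespace, patsy raises PatsyError, which is what lands the dropped column in cols.

import pandas as pd
import patsy

df = pd.DataFrame({"y": [1.0, 2.0], "a": [0.1, 0.2], "b": [3.0, 4.0]})
try:
    patsy.dmatrices("y ~ a + b", df.drop("a", axis=1))
except patsy.PatsyError:
    print("'a' is referenced by the formula")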
Example n. 54
    def create_regression(self, formula: str, metadata: pd.DataFrame):
        """Generate design matrix for count regression modeling.

        :param formula: Design formula to use in model
        :type formula: str

        :param metadata: Metadata for design matrix
        :type metadata: pd.DataFrame
        """
        self.dmat = dmatrix(formula,
                            metadata.loc[self.sample_names],
                            return_type="dataframe")
        self.colnames = self.dmat.columns

        param_dict = {
            "p": self.dmat.shape[1],
            "x": self.dmat.values,
        }
        self.add_parameters(param_dict)
Example n. 55
def transform_with_patsy(formula, data, *args, **kwargs):
    try:
        # needs patsy v0.5.1 to support formula in Python 3.7
        # https://github.com/pydata/patsy/pull/131
        import patsy
    except ImportError:
        raise ImportError("'patsy' is required to transform with string formula")

    if '~' in formula:
        y, X = patsy.dmatrices(formula, data=data, return_type='dataframe',
                               *args, **kwargs)
        if len(y.shape) > 1 and y.shape[1] != 1:
            raise ValueError('target must be 1 dimensional')
        y = y.iloc[:, 0]
        return data._constructor(X, target=y)
    else:
        X = patsy.dmatrix(formula, data=data, return_type='dataframe',
                          *args, **kwargs)
        return data._constructor(X)
Example n. 56
    def exposure_model(self, model, custom_model=None, bound=None):
        """Estimation of the exposure model, Pr(A=1|W). This value is used as the denominator for the inverse
        probability weights.

        Parameters
        ----------
        model : str
            Independent variables to predict the exposure. Example) 'var1 + var2 + var3'
        custom_model : optional
            Input for a custom model that is used in place of the logit model (default). The model must have the
            "fit()" and  "predict()" attributes. Both sklearn and supylearner are supported as custom models. In the
            background, TMLE will fit the custom model and generate the predicted probabilities
        bound : float, list, optional
            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
            Specifying this argument can improve finite sample performance for random positivity violations. However,
            truncating weights leads to additional confounding. Default is False, meaning no truncation of
            predicted probabilities occurs. Providing a single float assumes symmetric truncation, where values below
            or above the threshold are set to the threshold value. Alternatively a list of floats can be provided for
            asymmetric truncation, with the first value being the lower bound and the second being the upper bound
        """
        self._g_model = self.exposure + ' ~ ' + model

        if custom_model is None:  # Standard parametric regression model
            fitmodel = propensity_score(self.df,
                                        self._g_model,
                                        print_results=self._verbose_)
            pred = fitmodel.predict(self.df)
        else:  # User-specified prediction model
            self._exp_model_custom = True
            data = patsy.dmatrix(model + ' - 1', self.df)
            pred = exposure_machine_learner(xdata=np.asarray(data),
                                            ydata=np.asarray(
                                                self.df[self.exposure]),
                                            ml_model=custom_model,
                                            pdata=np.asarray(data))

        if bound is not None:
            pred2 = bounding(ipw=pred, bound=bound)
            self._specified_bound_ = np.sum(np.where(pred2 == pred, 0, 1))
            pred = pred2

        self._denominator_ = np.where(self.df[self.exposure] == 1, pred,
                                      1 - pred)
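bounding() is not shown above; a guess at its behavior, consistent with the docstring, is a clip on the predicted probabilities:

import numpy as np

def bounding(ipw, bound):
    # hypothetical reconstruction: symmetric float or [lower, upper] list
    if np.isscalar(bound):
        lower, upper = bound, 1 - bound
    else:
        lower, upper = bound
    return np.clip(ipw, lower, upper)

pred = np.array([0.01, 0.50, 0.99])
print(bounding(pred, 0.05))        # [0.05 0.5  0.95]
print(bounding(pred, [0.1, 0.9]))  # [0.1  0.5  0.9 ]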
Example n. 57
def design_mat(mod, numCovs, batch_levels):
    # require levels to make sure they are in the same order as we use in the
    # rest of the script.
    design = patsy.dmatrix("~ 0 + C(batch, levels=%s)" % str(batch_levels), mod, return_type="dataframe")

    mod = mod.drop(["batch"], axis=1)
    print("found %i batches" % design.shape[1], file=sys.stderr)
    # guard against numCovs=None before testing membership
    other_cols = [c for i, c in enumerate(mod.columns)
                  if numCovs is None or i not in numCovs]
    factor_matrix = mod[other_cols]
    design = pa.concat((design, factor_matrix), axis=1)
    if numCovs is not None:
        print("found %i numerical covariates..." % len(numCovs), file=sys.stderr)
        for i, nC in enumerate(numCovs):
            cname = mod.columns[nC]
            print("\t" + cname, file=sys.stderr)
            design[cname] = mod[mod.columns[nC]]
    print("found %i categorical variables:" % len(other_cols), file=sys.stderr)
    print("\t" + ", ".join(other_cols), file=sys.stderr)
    return design
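A self-contained Python 3 sketch of the batch-design line, showing that explicit levels pin the dummy-column order:

import pandas as pd
import patsy

mod = pd.DataFrame({"batch": ["b2", "b1", "b2", "b1"]})
batch_levels = ["b1", "b2"]
design = patsy.dmatrix("~ 0 + C(batch, levels=%s)" % str(batch_levels),
                       mod, return_type="dataframe")
# columns appear in the requested order: ...[b1], ...[b2]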
Example n. 58
def variable_effect(pheno,var,regressors,conn):
    """
    Test effect of continuous variable.
    
    pheno = dataframe:
        -filtered to be only relevant subjects (use mask_var)
    var = column from pheno
    regressors = list of strings, formatted for patsy
    conn = n_subjects x n_edges array
    
    Returns:
    table = n_edges
        - betas_std = including standardization on controls
        - pvalues = pvalues
        - qvalues = fdr corrected pvalues alpha = 0.05
    """
    
    n_edges = conn.shape[1]
    contrast = np.zeros(1 + len(regressors))
    contrast[0] = 1
    
    betas_std = np.zeros(n_edges)
    pvalues = np.zeros(n_edges)
        
    formula = ' + '.join((regressors + [var]))
    dmat = pat.dmatrix(formula, pheno, return_type='dataframe', NA_action='raise')
    
    mask_std = np.ones(pheno.shape[0]).astype(bool)
    conn_std = standardize(mask_std, conn)
    
    for edge in range(n_edges):
        model_std = sm.OLS(conn_std[:, edge], dmat)
        results_std = model_std.fit()
        betas_std[edge] = results_std.params[var]
        pvalues[edge] = results_std.pvalues[var]

    mt = multipletests(pvalues, method='fdr_bh')
    reject = mt[0]
    qvalues = mt[1]

    table = pd.DataFrame(np.array([betas_std, pvalues, qvalues, reject]).transpose(),
                         columns=['betas_std', 'pvalues', 'qvalues', 'reject'])
    return table
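A short sketch of the multiple-testing step used above: multipletests with method='fdr_bh' returns the reject mask and the Benjamini-Hochberg q-values in its first two slots.

import numpy as np
from statsmodels.stats.multitest import multipletests

pvalues = np.array([0.001, 0.02, 0.2, 0.8])
reject, qvalues, _, _ = multipletests(pvalues, method='fdr_bh')  # default alpha=0.05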
Example n. 59
    def filter_data_and_create_design_matrices(self):
        data_for_training = self.base_predictor.input_data.copy(deep=True)
        data_for_prediction = self.base_predictor.input_data.copy(deep=True)

        if self.channel == 'all':
            data_for_training = data_for_training.loc[
                data_for_training['days_since_first_order'] >=
                self.config['goal_horizon']]
            data_for_prediction = data_for_prediction.loc[
                data_for_prediction['days_since_first_order'] >=
                self.config['day_horizon']]
        else:
            data_for_training = data_for_training.loc[
                (data_for_training['days_since_first_order'] >=
                 self.config['goal_horizon'])
                & (data_for_training['attribution_level_1'] == self.channel)]
            data_for_prediction = data_for_prediction.loc[
                (data_for_prediction['days_since_first_order'] >=
                 self.config['day_horizon'])
                & (data_for_prediction['attribution_level_1'] == self.channel)]

        shuffled_training_data = data_for_training.sample(frac=1)

        training_columns = [
            i for i in iter_flatten(self.config['training_columns'])
        ]

        self.full_training_labels, filtered_training_data = patsy.dmatrices(
            self.config['goal_column'] + ' ~ 0 + ' +
            ' + '.join(training_columns),
            data=shuffled_training_data,
            return_type="dataframe")
        # fix enum column headers for xgb input requirements
        self.filtered_training_data = filtered_training_data.rename(
            columns=lambda x: x.replace("[", "(").replace("]", ")"))

        filtered_prediction_data = patsy.dmatrix('0 + ' +
                                                 ' + '.join(training_columns),
                                                 data=data_for_prediction,
                                                 return_type="dataframe")
        # fix enum column headers for xgb input requirements
        self.filtered_prediction_data = filtered_prediction_data.rename(
            columns=lambda x: x.replace("[", "(").replace("]", ")"))
Example n. 60
    def from_formula(cls, formula, data, priors=None,
                     vars=None, family='weibull', name='', model=None):
        import patsy
        ##### Here's how we parse the formula ######
        # Parse the formula and split into essential components
        #### TODO: Automatic selection of multivariate family based on dimension of inputs
        outcomes = formula.split("~")[0]
        # get time variables
        time_vars = [v.strip() for v in outcomes[outcomes.find("([") + 2:outcomes.find("]")].split(",")]
        # get event times
        event_raw = outcomes[outcomes.find("],") + 2:]
        event_vars = [v.strip() for v in event_raw[event_raw.find("[") + 1:event_raw.find("])")].split(",")]
        # Now get x, times, and events
        x = patsy.dmatrix(formula.split("~")[1].strip(), data)
        y = data[time_vars].to_numpy()  # .as_matrix() was removed from pandas
        e = data[event_vars].to_numpy()
        labels = x.design_info.column_names
        return cls(x=np.asarray(x), y=np.asarray(y)[:, 0], e=np.asarray(e)[:, 0], intercept=False, labels=labels,
                   priors=priors, vars=vars, family=family, name=name, model=model)