Example #1
def fit_poisson(station_id, include_rebalance=False, initial_time=datetime(2001, 1, 1), final_time=datetime(2020, 1, 1), time_interval='1H'):
    # Use the correct delta data
    station_updates = get_station_data(station_id)

    arrivals_departures = rebalance_station_poisson_data(station_updates, station_id, time_interval, include_rebalance=include_rebalance)

    # Create design matrix for months, hours, and weekday vs. weekend.
    # We can't just create a "month" column to toss into our model, because it doesn't
    # understand what "June" is. Instead, we need to create a column for each month
    # and code each row according to what month it's in. Ditto for hours and weekday (=1).

    y_arr, X_arr = patsy.dmatrices("arrivals ~ C(months, Treatment) + C(hours, Treatment) + C(weekday_dummy, Treatment)", arrivals_departures, return_type='dataframe')
    y_dep, X_dep = patsy.dmatrices("departures ~ C(months, Treatment) + C(hours, Treatment) + C(weekday_dummy, Treatment)", arrivals_departures, return_type='dataframe')

    y_dep[pd.isnull(y_dep)] = 0

    # Fit poisson distributions for arrivals and departures, print results
    arr_poisson_model = sm.Poisson(y_arr, X_arr)
    arr_poisson_results = arr_poisson_model.fit(disp=0)

    dep_poisson_model = sm.Poisson(y_dep, X_dep)
    dep_poisson_results = dep_poisson_model.fit(disp=0)

    # Calculate Error of the Above Models
    print(type(y_arr - arr_poisson_results.fittedvalues))

    error = sum((y_arr-arr_poisson_results.fittedvalues)**2)+sum((y_dep-dep_poisson_results.fittedvalues)**2) 

    # print arr_poisson_results.summary(), dep_poisson_results.summary()

    poisson_results = [arr_poisson_results, dep_poisson_results, error]

    return poisson_results
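The comment above describes patsy's dummy coding. A minimal sketch of that expansion on toy data (not the station dataset): C(months, Treatment) produces one indicator column per level, with the first level absorbed into the intercept as the reference.

import pandas as pd
import patsy

toy = pd.DataFrame({'arrivals': [3, 5, 2], 'months': [1, 2, 2]})
y, X = patsy.dmatrices('arrivals ~ C(months, Treatment)', toy,
                       return_type='dataframe')
print(X.columns.tolist())  # ['Intercept', 'C(months, Treatment)[T.2]']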
Example #2
def pandashandler(formula_like, data):
    """
    process a pysal model signature and convert an equation/formula pair into a
    pysal-specific object
    """
    if '||' in formula_like:
        mu, inst = formula_like.split('||')
        y, X = p.dmatrices(mu + '-1', data=data)
        yend, q = p.dmatrices(inst + '-1', data=data)
        rargs = [y,X,yend,q]
        rargs = [asarray(i) for i in rargs]
        name_y, name_x = mu.strip(' ').split('~')
        name_x = name_x.split('+')
        name_yend, name_q = inst.strip(' ').split('~')
        name_yend = [name_yend]
        name_q = name_q.split('+')
        names = {"name_y":name_y,
                 "name_x":name_x, 
                 "name_yend":name_yend,
                 "name_q":name_q}
    else:
        y, X = p.dmatrices(formula_like + '-1', data=data)
        rargs = [asarray(y), asarray(X)]
        name_y, name_x = formula_like.strip(' ').split('~')
        name_x = name_x.split('+')
        names = {"name_y":name_y,
                 "name_x":name_x}

    return rargs, names
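A hypothetical usage sketch for the function above (it assumes numpy's asarray and patsy imported as p at module level, as the body implies): the '||' separator splits the main equation from the instruments equation of a two-stage specification.

import pandas as pd

df = pd.DataFrame({'y': [1., 2., 3., 4.],
                   'x1': [0., 1., 0., 1.],
                   'yend': [2., 3., 1., 5.],
                   'q1': [1., 0., 1., 0.]})
rargs, names = pandashandler('y ~ x1 || yend ~ q1', df)
print(names['name_y'], names['name_yend'])  # 'y ' and ['yend '] (whitespace survives the split)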
Example #3
def stepwiseInit(upperScope,
                 dataFrame,
                 lowerScope=None,
                 startScope=None,
                 trace=False,
                 traceFile=stdout,
                 groupVars=False,
                 penaltyFn=stepwise_penalties.AICc()):
    #The first set of operations sets up the lower and upper scopes and infers defaults
    #if they are not given
    env = patsy.EvalEnvironment.capture()     
    upperScopeDesc = patsy.ModelDesc.from_formula(upperScope, env)
    startScopeDesc = None if startScope is None else patsy.ModelDesc.from_formula(startScope, env)
    lowerScopeDesc = None if lowerScope is None else patsy.ModelDesc.from_formula(lowerScope, env)
    if not lowerScope and patsy.Term([]) not in upperScopeDesc.rhs_termlist:
        raise StepwiseError("A lower scope of the model search must be specified when "
                            "the upperScope does not contain an intercept")
    if not lowerScope:  # build a formula with only an intercept
        lowerScopeDesc = patsy.ModelDesc(upperScopeDesc.lhs_termlist, [patsy.Term([])])
        lowerScope = lowerScopeDesc.describe()
    if not startScope:
        startScopeDesc = lowerScopeDesc
        startScope = lowerScopeDesc.describe()
    #TODO: check that lower scope is consistent with upper scope
    #TODO: check that starting scope is consistent with lower and upper scopes
    rhs_set = set(upperScopeDesc.rhs_termlist)
    for item in lowerScopeDesc.rhs_termlist:
        if item not in rhs_set:
            raise StepwiseError("term " + str(item) + " from formula:\n" +
                                lowerScope + "\nnot found in:\n" +
                                upperScope)
    for item in startScopeDesc.rhs_termlist:
        if item not in rhs_set:
            raise StepwiseError("term " + str(item) + " from formula:\n" +
                                startScope + "\nnot found in:\n" +
                                upperScope)

    y,X      = patsy.dmatrices(upperScope, data=dataFrame)
    y,Xprime = patsy.dmatrices(startScope, data=dataFrame)
    y,Xlower = patsy.dmatrices(lowerScope, data=dataFrame) 
    active   = np.zeros(X.shape[1], dtype=bool)
    lower_active = active.copy()
    lowerMsk = active.copy()
    assert y.shape[1] == 1, "Multiple responses not yet supported."
    y = y.flatten()
    featMap = dict([(name,index) for index,name in enumerate(X.design_info.column_names)]) 
    for feat in Xprime.design_info.column_names :
        active[featMap[feat]] = True
    for feat in Xlower.design_info.column_names :
        lower_active[featMap[feat]] = True
    #next step: fit model using only the active set of features
    beta, betaSigmaSq, SSE, df, Q = qr_based_solver.solve(X[:,active],y.flatten())

    residWithMean = y - np.mean(y)
    SSTO = np.dot(residWithMean,residWithMean)
    summary = computeSummary(beta,betaSigmaSq,SSE,SSTO,X.shape[0],df,
                                Xprime.design_info.column_names,startScope)
    
    return StepwiseFitter(LinearModelFit(summary.beta,summary),X,y,X.design_info.column_names,
                            upperScopeDesc.lhs_termlist,active,lower_active,penaltyFn,trace=trace,
                            traceFile=traceFile, groupVars=groupVars)
Example #4
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Returns endog, exog, and design information from arrays and a formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.ModelDesc
        The model formula. You can also register a custom handler by
        importing formula_handler and adding a key-value pair, where the
        key is the formula object's class and the value is a function
        that returns (endog, exog, formula object).

    Returns
    -------
    result : tuple
        The (endog, exog) design matrices; their types follow the inputs Y, X.
    missing_mask : ndarray or None
        Boolean mask of the rows dropped for missing values, if any.
    design_info : patsy.DesignInfo or None
        Design info for the RHS design matrix, if there is one.
    """
    # best-effort attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    if X is not None:
        # dmatrices handles pandas and non-pandas inputs alike
        result = dmatrices(formula, (Y, X), depth,
                           return_type='dataframe', NA_action=na_action)
    else:
        result = dmatrices(formula, Y, depth, return_type='dataframe',
                           NA_action=na_action)

    # if missing == 'raise' there's no missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    if len(result) > 1:  # have RHS design
        design_info = result[1].design_info  # detach it from DataFrame
    else:
        design_info = None
    # NOTE: is there ever a case where we'd need LHS design_info?
    return result, missing_mask, design_info
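A minimal sketch of the NA handling this function relies on: patsy's stock NAAction drops rows containing NaN when on_NA='drop'. (The missing_mask attribute read above comes from statsmodels' own NAAction subclass, hence the guarded getattr.)

import numpy as np
import pandas as pd
from patsy import NAAction, dmatrices

df = pd.DataFrame({'y': [1.0, np.nan, 3.0], 'x': [1.0, 2.0, 3.0]})
na_action = NAAction(on_NA='drop')
y, X = dmatrices('y ~ x', df, return_type='dataframe', NA_action=na_action)
print(len(y))  # 2 -- the row containing NaN was dropped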
Example #5
    def gen_predictors(self):
        """Generates the predictors data frame"""
        model = read_csv(self.model_file)
        _, predictors = dmatrices(self.formula, model)
        self.di = predictors.design_info
        self.predictors = DataFrame(predictors, columns=self.di.column_names)
        return self.predictors
Example #6
    def __init__(self, model, *args, lazy=False):
        """
        Initialize a linear model

        Parameters
        ==========
        model: str
            model string
        args: argument list

        Returns
        =======
        No return

        Raises
        ======
        LogicalError
            If model is wrong
        """
        self.modelstr = model
        self.model = dmatrices(model, *args)
        if len(self.model) != 2:
            raise LogicalError("Invalid model specification: the formula should have variables on both sides of '~'")
        if self.model[0].shape[1] != 1:  # TODO: add support for multiple responses later
            raise LogicalError("Multiple-response regression is not supported")
        self._regress()
Example #7
def vcfassoc(formula, covariate_df, groups=None):

    y, X = patsy.dmatrices(str(formula), covariate_df, return_type='dataframe')
    # get the column containing genotype
    ix = get_genotype_ix(X)
    Binomial = sm.families.Binomial
    logit = sm.families.links.Logit()

    if groups is not None:
        #covariate_df['grps'] = map(str, range(len(covariate_df) / 8)) * 8
        if not isinstance(groups, (pd.DataFrame, np.ndarray)):
            cov = Exchangeable()
            model = sm.GEE(y, X, groups=covariate_df[groups], cov_struct=cov,
                    family=Binomial())
        else:
            model = sm.GLS(logit(y), X, sigma=groups.loc[X.index, X.index])
    else:
        model = sm.GLM(y, X, missing='drop', family=Binomial())

    result = model.fit(maxiter=1000)
    res = {'OR': np.exp(result.params.iloc[ix]),
           'pvalue': result.pvalues.iloc[ix],
           'z': result.tvalues.iloc[ix],
           'OR_CI': tuple(np.exp(result.conf_int().iloc[ix, :])),
           }
    try:
        res['df_resid'] = result.df_resid
    except AttributeError:
        pass
    return res
Example #8
def xtab(formula, covariate_df):
    y, X = patsy.dmatrices(str(formula), covariate_df)
    X = patsy.dmatrix('genotype', covariate_df)
    ix = get_genotype_ix(X)

    tbl = pd.crosstab(X[:, ix], y.ravel())
    try:
        tbl.columns = ['%s_%i' % (y.design_info.column_names[-1], j) for j in range(2)]
    except Exception:
        return None  # too few samples
    tbl.index = ['%i_alts' % i for i in tbl.index]
    alts = set(tbl.index)
    if len(alts) < 2 or '0_alts' not in alts:
        tbl_dom = None
    else:
        tbl_dom = pd.DataFrame({'0_alts': tbl.loc['0_alts', :],
                                'n_alts': tbl.loc[list(alts - {'0_alts'}), :].sum()}).T

    # can't test recessive without any homozygous alts.
    if '2_alts' not in alts or len(alts) < 2:
        tbl_rec = None
    else:
        tbl_rec = pd.DataFrame({'lt2_alts': tbl.loc[['0_alts', '1_alts'], :].sum(),
                                '2_alts': tbl.loc['2_alts', :]})

    d = {}
    for name, xtbl in (('additive', tbl), ('dominant', tbl_dom), ('recessive', tbl_rec)):
        if xtbl is None:
            d['p.chi.%s' % name] = 'nan'
            continue

        chi, p, ddof, e = chi2_contingency(xtbl)
        if name == 'additive':
            d = xtbl.to_dict()
        d['p.chi.%s' % name] = "%.3g" % p
    return d
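A toy illustration of the dominant collapse above, with hypothetical counts: the rows for one and two alternate alleles are summed into a single 'n_alts' row before the chi-square test.

import pandas as pd

tbl = pd.DataFrame({'case_0': [10, 5, 2], 'case_1': [8, 9, 6]},
                   index=['0_alts', '1_alts', '2_alts'])
tbl_dom = pd.DataFrame({'0_alts': tbl.loc['0_alts', :],
                        'n_alts': tbl.loc[['1_alts', '2_alts'], :].sum()}).T
print(tbl_dom)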
Example #9
def randforpat():
    df = pd.read_csv("train.csv")
    cleanpatsy(df)
    y, X = dmatrices('Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Cabin + Embarked',df, return_type="dataframe")
    y = np.ravel(y)


    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(X, y)
    print(forest.score(X, y))

    # evaluate the model by splitting into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    model2 = RandomForestClassifier(n_estimators=100)
    model2.fit(X_train, y_train)
    predicted = model2.predict(X_test)
    print(metrics.accuracy_score(y_test, predicted))


    dftest = pd.read_csv("test.csv")
    cleanpatsy(dftest)
    X = dmatrix('Pclass + Sex + Age + SibSp + Parch + Fare + Cabin + Embarked',dftest, return_type="dataframe")
    output = forest.predict(X).astype(int)
    result = {'PassengerId':dftest.PassengerId, 'Survived':output}
    dfresult = pd.DataFrame(result)
    dfresult.to_csv("result.csv",index=False)
Example #10
    def from_formula(cls, formula, data, priors=None,
                     vars=None, family='normal', name='', model=None):
        import patsy
        y, x = patsy.dmatrices(formula, data)
        labels = x.design_info.column_names
        return cls(np.asarray(x), np.asarray(y)[:, 0], intercept=False, labels=labels,
                   priors=priors, vars=vars, family=family, name=name, model=model)
Example #11
    def predict(self,h=5,oos_data=None):
        """ Makes forecast with the estimated model

        Parameters
        ----------
        h : int (default : 5)
            How many steps ahead would you like to forecast?

        oos_data : pd.DataFrame
            Data to use for the predictors in the forecast

        Returns
        ----------
        - pd.DataFrame with predicted values
        """     

        if self.latent_variables.estimated is False:
            raise Exception("No latent variables estimated!")
        else:
            _, X_oos = dmatrices(self.formula, oos_data)
            X_oos = np.array([X_oos])[0]
            X_pred = X_oos[:h]

            sigma2, Y, scores, _ = self._model(self.latent_variables.get_z_values()) 
            date_index = self.shift_dates(h)
            t_params = self.transform_z()

            mean_values = self._mean_prediction(sigma2,Y,scores,h,t_params,X_pred)
            forecasted_values = mean_values[-h:]
            result = pd.DataFrame(np.exp(forecasted_values/2.0))
            result.rename(columns={0:self.data_name}, inplace=True)
            result.index = date_index[-h:]

            return result
Example #12
def main():
    train_df_filled=fill_null_vals(train_df,'Fare')
    train_df_filled=fill_null_vals(train_df_filled,'Age')
    assert len(train_df_filled)==len(train_df)
    
    test_df_filled=fill_null_vals(test_df,'Fare')
    test_df_filled=fill_null_vals(test_df_filled,'Age')
    assert len(test_df_filled)==len(test_df)

    for formula_name, formula in formula_map.items():
        print("name=%s formula=%s" % (formula_name, formula))
        y_train, X_train = dmatrices('Survived ~ ' + formula,
                                     train_df_filled, return_type='dataframe')
        print("Running logistic regression with formula : %s" % formula)
        print("X_train cols=%s " % X_train.columns)
        y_train = np.ravel(y_train)
        model = LogisticRegression()
        lr_model = model.fit(X_train, y_train)
        print("Training score:%s" % lr_model.score(X_train, y_train))
        X_test = dmatrix(formula, test_df_filled)
        predicted = lr_model.predict(X_test)
        print("predicted:%s\n" % predicted[:5])
        assert len(predicted)==len(test_df)
        pred_results=pd.Series(predicted,name='Survived')
        lr_results=pd.concat([test_df['PassengerId'],pred_results],axis=1)
        lr_results.Survived=lr_results.Survived.astype(int)
        results_file='csv/logisticregr_%s.csv' % formula_name
        #results_file = re.sub('[+ ()C]','',results_file)
        lr_results.to_csv(results_file,index=False)
Example #13
    def __new__(cls, formula, data, priors=None,
                intercept_prior=None,
                regressor_prior=None,
                init_vals=None,
                family='normal',
                model=None,
                name=''):
        _families = dict(
            normal=families.Normal,
            student=families.StudentT,
            binomial=families.Binomial,
            poisson=families.Poisson
        )
        if isinstance(family, str):
            family = _families[family]()

        y_data = np.asarray(patsy.dmatrices(formula, data)[0]).T

        y_est, coeffs = linear_component(
            formula, data, priors=priors,
            intercept_prior=intercept_prior,
            regressor_prior=regressor_prior,
            init_vals=init_vals,
            model=model,
            name=name
            )
        family.create_likelihood(name, y_est, y_data, model=model)

        return super(glm, cls).__new__(cls, y_est, coeffs)
Example #14
    def __setstate__(self, d):
        if "restore_design_info" in d:
            # NOTE: there may be a more performant way to do this
            from patsy import dmatrices, PatsyError
            exc = []
            try:
                data = d['frame']
            except KeyError:
                data = d['orig_endog'].join(d['orig_exog'])

            for depth in [2, 3, 1, 0, 4]:  # sequence is a guess where to likely find it
                try:
                    _, design = dmatrices(d['formula'], data, eval_env=depth,
                                          return_type='dataframe')
                    break
                except (NameError, PatsyError) as e:
                    print('not in depth %d' % depth)
                    # py3 clears the except variable on block exit, so keep a reference
                    exc.append(e)
            else:
                raise exc[-1]

            self.design_info = design.design_info
            del d["restore_design_info"]
        self.__dict__.update(d)
Example #15
def pse_perSs_perCond(data,combos):
    '''
    Returns dict of PSE per conditions per subject.

    data: recarray of data with trials as level of analysis
    combos: dict of combinations of keys from other columns of data you want PSE per
            keys: label you want per PSE (probably of combinations of keys/conditions)
            values: dict of column names (keys) and values (values)
    '''

    ssdat = []
    for s in np.unique(data['subjid']):
        for c, combo in combos.items():

            # slice data
            slicer = data['subjid'] == s
            for col in combo:
                slicer *= data[col] == combo[col]
            dsliced = data[slicer]
            # Prepare the data
            frame = pd.DataFrame({'non2targ': list(dsliced['non2targ']),
                                  'morph': list(dsliced['morph'] - 6)})
            y, X = dmatrices('non2targ ~ morph', frame)
            y = np.ravel(y)
            # Fit the data to a logistic regression model
            model = LogisticRegression()
            model = model.fit(X, y)
            pse = -1 * (model.coef_[0][0] / model.coef_[0][1])
            if not np.isfinite(pse):  # also catches NaN
                raise NameError('NaN or nonfinite return')
            #ssdat[s][c] = pse
            ssdat.append([s, c, float(pse)])
           
    return ssdat
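A hypothetical combos argument matching the docstring above: each label maps to the column/value filters that define one slice of the data.

combos = {
    'cond_easy': {'condition': 'easy'},
    'cond_hard': {'condition': 'hard'},
}
# ssdat = pse_perSs_perCond(data, combos)  # one PSE per subject per label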
Example #16
    def __init__(self,data,p,q,formula):

        # Initialize TSM object
        super(EGARCHMReg,self).__init__('EGARCHMReg')

        # Latent variables
        self.p = p
        self.q = q
        self.max_lag = max(self.p,self.q)  
        self.z_no = self.p + self.q + 2
        self._z_hide = 0 # Whether to cutoff variance latent variables from results
        self.supported_methods = ["MLE","PML","Laplace","M-H","BBVI"]
        self.default_method = "MLE"
        self.multivariate_model = False
        self.leverage = False
        self.model_name = "EGARCHMReg(" + str(self.p) + "," + str(self.q) + ")"

        # Format the data
        self.is_pandas = True # This is compulsory for this model type
        self.data_original = data
        self.formula = formula
        self.y, self.X = dmatrices(formula, data)
        self.z_no += self.X.shape[1]*2
        self.y_name = self.y.design_info.describe()
        self.data_name = self.y_name
        self.X_names = self.X.design_info.describe().split(" + ")
        self.y = np.array([self.y]).ravel()
        self.data = self.y
        self.X = np.array([self.X])[0]
        self.index = data.index
        self.initial_values = np.zeros(self.z_no)

        self._create_latent_variables()
Example #17
def main():
    train_df_filled=fill_null_vals(train_df,'Fare')
    train_df_filled=fill_null_vals(train_df_filled,'Age')
    assert len(train_df_filled)==len(train_df)
    
    test_df_filled=fill_null_vals(test_df,'Fare')
    test_df_filled=fill_null_vals(test_df_filled,'Age')
    assert len(test_df_filled)==len(test_df)

    
    num_estimators = 10000
    for formula_name, formula in formula_map.items():

        print("name=%s formula=%s" % (formula_name, formula))

        y_train, X_train = dmatrices('Survived ~ ' + formula,
                                     train_df_filled, return_type='dataframe')
        print("Running RandomForestClassifier with formula : %s" % formula)
        print("X_train cols=%s " % X_train.columns)
        y_train = np.ravel(y_train)
        model = RandomForestClassifier(n_estimators=num_estimators, random_state=0)
        print("About to fit...")
        rf_model = model.fit(X_train, y_train)
        print("Training score:%s" % rf_model.score(X_train, y_train))
        X_test = dmatrix(formula, test_df_filled)
        predicted = rf_model.predict(X_test)
        print("predicted:%s" % predicted[:5])
        assert len(predicted)==len(test_df)
        pred_results=pd.Series(predicted,name='Survived')
        rf_results=pd.concat([test_df['PassengerId'],pred_results],axis=1)
        rf_results.Survived=rf_results.Survived.astype(int)
        results_file='csv/rf_%s_n_est_%s.csv' % (formula_name,num_estimators)
        print "output file: %s\n" % results_file
        #results_file = re.sub('[+ ()C]','',results_file)
        rf_results.to_csv(results_file,index=False)
Example #18
    def predict(self,h=5,oos_data=None):
        """ Makes forecast with the estimated model

        Parameters
        ----------
        h : int (default : 5)
            How many steps ahead would you like to forecast?

        oos_data : pd.DataFrame
            Data for the variables to be used out of sample (ys can be NaNs)

        Returns
        ----------
        - pd.DataFrame with predicted values
        """     

        if self.parameters.estimated is False:
            raise Exception("No parameters estimated!")
        else:
            # Sort/manipulate the out-of-sample data
            _, X_oos = dmatrices(self.formula, oos_data)
            X_oos = np.array([X_oos])[0]
            X_pred = X_oos[:h]
            mu, Y = self._model(self.parameters.get_parameter_values())         
            date_index = self.shift_dates(h)
            t_params = self.transform_parameters()

            mean_values = self._mean_prediction(mu,Y,h,t_params,X_pred)
            forecasted_values = mean_values[-h:]
            result = pd.DataFrame(forecasted_values)
            result.rename(columns={0:self.data_name}, inplace=True)
            result.index = date_index[-h:]

            return result
Example #19
    def __init__(self,formula,data):

        # Initialize TSM object
        super(DynLin,self).__init__('DynLin')

        # Parameters
        self.max_lag = 0
        self._param_hide = 0 # Whether to cutoff variance parameters from results
        self.supported_methods = ["MLE","PML","Laplace","M-H","BBVI"]
        self.default_method = "MLE"
        self.model_name = "Dynamic Linear Regression"
        self.multivariate_model = False

        # Format the data
        self.is_pandas = True # This is compulsory for this model type
        self.data_original = data
        self.formula = formula
        self.y, self.X = dmatrices(formula, data)
        self.param_no = self.X.shape[1] + 1
        self.y_name = self.y.design_info.describe()
        self.data_name = self.y_name
        self.X_names = self.X.design_info.describe().split(" + ")
        self.y = np.array([self.y]).ravel()
        self.data = self.y
        self.X = np.array([self.X])[0]
        self.index = data.index

        self._create_parameters()
Example #20
def main():
    train_df_filled=fill_null_vals(train_df,'Fare')
    train_df_filled=fill_null_vals(train_df_filled,'Age')
    assert len(train_df_filled)==len(train_df)
    
    test_df_filled=fill_null_vals(test_df,'Fare')
    test_df_filled=fill_null_vals(test_df_filled,'Age')
    assert len(test_df_filled)==len(test_df)

    

    for formula_name, formula in formula_map.items():

        print("name=%s formula=%s" % (formula_name, formula))

        y_train, X_train = dmatrices('Survived ~ ' + formula,
                                     train_df_filled, return_type='dataframe')
        print("Running DecisionTreeClassifier with formula : %s" % formula)
        print("X_train cols=%s " % X_train.columns)
        y_train = np.ravel(y_train)
        model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
        print("About to fit...")
        dt_model = model.fit(X_train, y_train)
        print("Training score:%s" % dt_model.score(X_train, y_train))
        X_test = dmatrix(formula, test_df_filled)
        predicted = dt_model.predict(X_test)
        print("predicted:%s" % predicted[:5])
        assert len(predicted)==len(test_df)
        pred_results=pd.Series(predicted,name='Survived')
        dt_results=pd.concat([test_df['PassengerId'],pred_results],axis=1)
        dt_results.Survived=dt_results.Survived.astype(int)
        results_file='csv/dt_%s.csv' % (formula_name)
        print "output file: %s\n" % results_file
        #results_file = re.sub('[+ ()C]','',results_file)
        dt_results.to_csv(results_file,index=False)
Example #21
def pandashandler(formula_like, data):
    """
    process a pysal model signature and convert an equation/formula pair into a
    pysal-specific object
    """
    if '||' in formula_like:
        mu, inst = formula_like.split('||')
        y, X = p.dmatrices(mu + '-1', data=data)
        yend, q = p.dmatrices(inst + '-1', data=data)
        rargs = [y,X,yend,q]
        rargs = [asarray(i) for i in rargs]
    else:
        y, X = p.dmatrices(formula_like + '-1', data=data)
        rargs = [asarray(y), asarray(X)]

    return rargs
Example #22
def logistic_regression(data):
    y, X = dmatrices(LR_FORMULA, data)
    y = np.ravel(y)
    model = LogisticRegression(penalty='l1', C=0.1, fit_intercept=True)
    model = model.fit(X, y)
    print(model.score(X, y))
    return model
Example #23
def ready_for_model(df):
    cols = list(df.columns.values)

    # keep columns
    cols_keep = []
    cols_giveup = []
    for c in df:
        if df[c].dtype in [int, float]:
            cols_keep.append(c)
        elif df[c].dtype == object:
            if df[c].nunique() < 25:
                cols_keep.append(c)
            else:
                cols_giveup.append(c)

    # remove the labels
    for to_remove in ['id', 'status', 'status_group']:
        cols_keep.remove(to_remove)

    # convert df to X, y by patsy
    r_formula = 'status ~' + ' + '.join(cols_keep)
    df_y, df_X = patsy.dmatrices(r_formula, df, return_type='dataframe')

    cols_X = df_X.columns
    X = df_X.values
    y = df_y.values
    return (X, y, cols_X, r_formula, cols_keep, cols_giveup)
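A toy illustration of the formula assembly above (hypothetical column names); note that patsy tolerates the missing space after '~'.

cols_keep = ['amount_tsh', 'gps_height', 'basin']
r_formula = 'status ~' + ' + '.join(cols_keep)
print(r_formula)  # status ~amount_tsh + gps_height + basin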
Example #24
def logisticpatsy():
    df = pd.read_csv("train.csv")
    cleanpatsy(df)
    y, X = dmatrices('Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Cabin + Embarked', df, return_type="dataframe")
    y = np.ravel(y)

    model = LogisticRegression()
    model = model.fit(X, y)

    # check the accuracy on the training set
    print(model.score(X, y))

    # evaluate the model by splitting into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    model2 = LogisticRegression()
    model2.fit(X_train, y_train)
    predicted = model2.predict(X_test)
    print(metrics.accuracy_score(y_test, predicted))


    dftest = pd.read_csv("test.csv")
    cleanpatsy(dftest)
    X = dmatrix('Pclass + Sex + Age + SibSp + Parch + Fare + Cabin + Embarked',dftest, return_type="dataframe")
    predict_survive = model.predict(X)
    result = {'PassengerId':dftest.PassengerId, 'Survived':predict_survive}
    dfresult = pd.DataFrame(result)
    dfresult.to_csv("result.csv",index=False)
    print(pd.DataFrame(list(zip(X.columns, np.transpose(model.coef_)))))
Example #25
def nominal_logistic_regression():
    '''Nominal Logistic Regression
    chapter 8.3,  p. 155 
    
    At this point, nominal logistic regression cannot be done with the formula approach.
    
    Regarding the output, note that R produces log(pi2/pi1) and log(pi3/pi1), while
    statsmodels produces log(pi2/pi1) and log(pi3/pi2) 
    '''
    
    # Get the data
    inFile = r'GLM_data/Table 8.1 Car preferences.xls'
    df = get_data(inFile)    

    # to make sure that "women" and "no/little" are the reference,
    # adjust them such that they come first alphabetically
    df.loc[df['response'] == 'no/little', 'response'] = '_no/little'
    df.loc[df['sex'] == 'women', 'sex'] = '_women'
    print(df)
    
    
    # Generate the design matrices using patsy
    pm = patsy.dmatrices('response~sex+age', data=df)
    
    # Generate the endog and exog matrices
    endog = np.repeat(np.array(df['response']), df['frequency'].values.astype(int), axis=0)
    exog = np.array(np.repeat(pm[1], df['frequency'].values.astype(int), axis=0))
    exog = pd.DataFrame(exog, columns=pm[1].design_info.column_names) 

    # Fit the model, and print the summary
    model = sm.MNLogit(endog, exog, method='nm').fit()
    print(model.summary())
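A minimal sketch of the frequency expansion used above: each row of the grouped table is repeated according to its count, so the model sees individual-level observations.

import numpy as np
import pandas as pd

df = pd.DataFrame({'response': ['yes', 'no'], 'frequency': [2, 3]})
expanded = np.repeat(np.array(df['response']), df['frequency'].values.astype(int), axis=0)
print(list(expanded))  # ['yes', 'yes', 'no', 'no', 'no']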
Example #26
    def predict(self,h=5,oos_data=None):
        """ Makes forecast with the estimated model

        Parameters
        ----------
        h : int (default : 5)
            How many steps ahead would you like to forecast?

        oos_data : pd.DataFrame
            Data for the variables to be used out of sample (ys can be NaNs)

        Returns
        ----------
        - pd.DataFrame with predicted values
        """     

        if self.parameters.estimated is False:
            raise Exception("No parameters estimated!")
        else:

            # Sort/manipulate the out-of-sample data
            _, X_oos = dmatrices(self.formula, oos_data)
            X_oos = np.array([X_oos])[0]
            X_pred = X_oos[:h]
            date_index = self.shift_dates(h)
            _, _, _, coefficients = self._model(self.parameters.get_parameter_values()) 
            coefficients_star = coefficients.T[-1]
            theta_pred = np.dot(np.array([coefficients_star]), X_pred.T)[0]

            result = pd.DataFrame(self.link(theta_pred))
            result.rename(columns={0:self.y_name}, inplace=True)
            result.index = date_index[-h:]

            return result
Example #27
def test_from_formula_vs_no_formula():
    mod = _MultivariateOLS.from_formula("Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted", data)
    r = mod.fit(method="svd")
    r0 = r.mv_test()
    endog, exog = patsy.dmatrices(
        "Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted", data, return_type="dataframe"
    )
    L = np.array([[1, 0, 0, 0, 0, 0]])
    # DataFrame input
    r = _MultivariateOLS(endog, exog).fit(method="svd")
    r1 = r.mv_test(hypotheses=[["Intercept", L, None]])
    assert_array_almost_equal(r1["Intercept"]["stat"].values, r0["Intercept"]["stat"].values, decimal=6)
    # Numpy array input
    r = _MultivariateOLS(endog.values, exog.values).fit(method="svd")
    r1 = r.mv_test(hypotheses=[["Intercept", L, None]])
    assert_array_almost_equal(r1["Intercept"]["stat"].values, r0["Intercept"]["stat"].values, decimal=6)
    L = np.array([[0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0]])
    r1 = r.mv_test(hypotheses=[["Drug", L, None]])
    # DataFrame input
    r = _MultivariateOLS(endog, exog).fit(method="svd")
    r1 = r.mv_test(hypotheses=[["Drug", L, None]])
    assert_array_almost_equal(r1["Drug"]["stat"].values, r0["Drug"]["stat"].values, decimal=6)
    # Numpy array input
    r = _MultivariateOLS(endog.values, exog.values).fit(method="svd")
    r1 = r.mv_test(hypotheses=[["Drug", L, None]])
    assert_array_almost_equal(r1["Drug"]["stat"].values, r0["Drug"]["stat"].values, decimal=6)
Example #28
def generate_data(n, loss, wt_param=2, return_rate=False):
    """
    Generate random data for testing
    :param n:
    :param loss:
    :param wt_param:
    :return:
    """
    w = np.random.randint(1, wt_param, n)
    if loss == 'squared':
        y = np.random.normal(50, 100, size=n)
    if loss == 'logistic':
        # Binomial - n number of trials, p probability of success per trial
        if return_rate:
            y = [sum(np.random.binomial(1, 0.1, 100) == 1) / 100.0 for _ in range(n)]
        else:
            y = np.random.binomial(1, 0.1, n)
    if loss == 'poisson':
        # Poisson - lambda expected events in an interval [avg_no_of_events_or_rate trials] - response
        if return_rate:
            y = [count*1.0/trials for count, trials in zip(np.random.poisson(10, size=n), w)]
        else:
            y = np.random.poisson(10, size=n)

    d = {'value': y,
         'feature1': [np.random.choice(['a', 'b', 'c']) for _ in range(n)],
         'feature2': [np.random.choice(['pp', 'qq']) for _ in range(n)]}

    df = pd.DataFrame(d)
    out = ptsy.dmatrices('value ~ feature1 + feature2', data=df, return_type='dataframe')
    y, X = out
    return w, y, X
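A usage sketch for the generator above, assuming the module-level imports the function relies on (numpy as np, pandas as pd, and patsy imported as ptsy):

w, y, X = generate_data(100, 'poisson')
print(X.columns.tolist())  # ['Intercept', 'feature1[T.b]', 'feature1[T.c]', 'feature2[T.qq]']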
Example #29
    def __init__(self, data, eqn, **kwargs):
        
        self.dmatrices=patsy.dmatrices(eqn, data)
        self.eqn=eqn
        
        self.y=np.array(self.dmatrices[0])
        self.X=np.array(self.dmatrices[1])
        self.column_names=self.dmatrices[1].design_info.column_names
        
            
        MCMCModel_Meta.__init__(self,**kwargs)

        self.index={}
        self.keys=[]
        self.params={}
        count=0
        for paramname in ['beta_%d' % _ for _ in range(len(self.column_names))]:
            if paramname in kwargs:
                self.params[paramname]=kwargs[paramname]
            else:
                self.params[paramname]=Normal(0,10)
            self.index[paramname]=count
            self.keys.append(paramname)
            count+=1
        
        if 'sigma' in kwargs:
            self.params['_sigma']=kwargs['sigma']
        else:
            self.params['_sigma']=Jeffries()

        self.keys.append('_sigma')        
        self.index['_sigma']=len(self.keys)-1
Example #30
def main():

    fname = "loans_imputed.csv"
    df = pd.read_csv(fname)

    #print df.describe()
    df.hist()
    plt.show()

    # clean up the dataframe
    df.rename(columns={'not.fully.paid': 'not_fully_paid',
                       'credit.policy': 'credit_policy',
                       'int.rate': 'int_rate',
                       'log.annual.inc': 'log_annual_inc',
                       'days.with.cr.line': 'days_with_cr_line',
                       'revol.bal': 'revol_bal',
                       'inq.last.6mths': 'inq_last_6mths',
                       'delinq.2yrs': 'delinq_2yrs',
                       'pub.rec': 'pub_rec'}, inplace=True)

    y, X = dmatrices('not_fully_paid ~ credit_policy + int_rate + \
                     installment + log_annual_inc + dti + \
                     days_with_cr_line + revol_bal + inq_last_6mths + \
                     delinq_2yrs + pub_rec',
                     df, return_type='dataframe')

    model = LogisticRegression()
    model.fit(X, np.ravel(y))
    predict = model.predict(X)

    print()
    print()
    print('Model accuracy: %f' % (model.score(X, np.ravel(y)) * 100.0))
    print(pd.DataFrame(list(zip(X.columns, np.transpose(model.coef_)))))
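As an aside, the dotted CSV headers are renamed above because '.' is not valid in a patsy formula term. A sketch of the alternative: patsy's Q() builtin quotes names that are not Python identifiers.

import pandas as pd
from patsy import dmatrices

raw = pd.DataFrame({'not.fully.paid': [0, 1, 0], 'int.rate': [0.10, 0.15, 0.20]})
y, X = dmatrices("Q('not.fully.paid') ~ Q('int.rate')", raw,
                 return_type='dataframe')
print(X.columns.tolist())  # ['Intercept', "Q('int.rate')"]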
Example #31
v = np.random.normal(0, var_v, n)**3

#create a pandas dataframe (easily parseable object for manipulation)
A = pd.DataFrame({'x': x, 'z': z, 'v': v})
#compute the success probability for our 3 independent variables:
#the sigmoid maps the linear predictor (the log odds) to a probability
A['log_odds'] = sigmoid(A[['x', 'z', 'v']].dot([beta_x, beta_z, beta_v]) +
                        sigma * np.random.normal(0, 1, n))

#draw each binary outcome from a binomial distribution with the probability above.
#A binomial random variable is the number of successes in n repeated trials of a
#binomial experiment; with n=1 this is a single Bernoulli trial per observation.
A['y'] = [np.random.binomial(1, p) for p in A.log_odds]

#create a dataframe that encompasses our input data, model formula, and outputs
y, X = dmatrices(formula, A, return_type='dataframe')

#print it
X.head(100)


#guard against singular matrices, the linear-algebra analogue of dividing by zero
def catch_singularity(f):
    '''Silences LinAlgErrors and raises a warning instead.'''
    def silencer(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except np.linalg.LinAlgError:
            warnings.warn('Algorithm terminated - singular Hessian!')
            return args[0]
    return silencer
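A usage sketch for the decorator above, with a hypothetical solve step: a singular matrix now produces a warning and returns the first argument unchanged instead of raising.

import warnings
import numpy as np

@catch_singularity
def solve_step(A, b):
    return np.linalg.solve(A, b)

singular = np.zeros((2, 2))
result = solve_step(singular, np.ones(2))  # warns; returns `singular` unchanged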
Example #32
    def ModelLogisticReg(self):
        print('+++++++++++++++++++++++++ LOGISTIC REGRESSION 1 +++++++++++++++++++++++++')

        # Read the CSV file: first get a handle
        comLRHandle = ReadCSV.Read_CSV()
        data = comLRHandle.Read(self.path)

        # Let's count the data and check for any missing values
        #print data.count(0)

        #PassengerId    891
        #Survived       891
        #Pclass         891
        #Name           891
        #Sex            891
        #Age            714
        #SibSp          891
        #Parch          891
        #Ticket         891
        #Fare           891
        #Cabin          204
        #Embarked       889

        # We need to remove Name, Cabin and Ticket because these are not useful
        data = data.drop(['Ticket', 'Cabin', 'Name'], axis=1)

        # Drop NAs as well. We could impute them (e.g. with the average), but let's drop them for now
        data = data.dropna()

        # We'll use a Python package called Patsy, which helps in describing
        # statistical models. It defines a dependent/independent variable
        # formula similar to R's: the variable to the left of '~' is the
        # dependent variable, and the variables to the right of it are the
        # independent variables. Variables enclosed in C() are treated as
        # categorical.

        formula = 'Survived ~ C(Pclass) + C(Sex) + Age + SibSp + C(Embarked) + Parch'
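        # For instance, C(Pclass) with levels {1, 2, 3} expands into two
        # indicator columns, C(Pclass)[T.2] and C(Pclass)[T.3]; Pclass == 1
        # is the reference level absorbed by the intercept.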

        # Split the samples: the first 600 for training, the rest for validation
        DFTdataX = data.iloc[0:600, :]  # take first 600 samples (For Training)
        DFVdataX = data.iloc[600:, :]  # take remaining samples (For Testing)

        # Splitting the data into dependent and independent variables
        TdataY, TdataX = patsy.dmatrices(formula,
                                         data=DFTdataX,
                                         return_type='dataframe')
        VdataY, VdataX = patsy.dmatrices(formula,
                                         data=DFVdataX,
                                         return_type='dataframe')

        # Let's instantiate our model using the stats package
        LogistModel = sm.Logit(TdataY, TdataX)

        # Fit the model
        ResLogModel = LogistModel.fit()

        #print ResLogModel.summary()

        #Logit Regression Results
        #==============================================================================
        #Dep. Variable:               Survived   No. Observations:                  600
        #Model:                          Logit   Df Residuals:                      591
        #Method:                           MLE   Df Model:                            8
        #Date:                Fri, 13 Nov 2015   Pseudo R-squ.:                  0.3333
        #Time:                        22:39:44   Log-Likelihood:                -270.02
        #converged:                       True   LL-Null:                       -404.99
        #LLR p-value:                 1.009e-53
        #====================================================================================
        #                       coef    std err          z      P>|z|      [95.0% Conf. Int.]
        #------------------------------------------------------------------------------------
        #-Intercept            4.3332      0.510      8.490      0.000         3.333     5.334
        #-C(Pclass)[T.2]      -1.2030      0.325     -3.703      0.000        -1.840    -0.566
        #-C(Pclass)[T.3]      -2.4673      0.320     -7.705      0.000        -3.095    -1.840
        #-C(Sex)[T.male]      -2.6312      0.244    -10.797      0.000        -3.109    -2.154
        #+C(Embarked)[T.Q]    -0.4359      0.647     -0.674      0.501        -1.704     0.832
        #+C(Embarked)[T.S]    -0.2910      0.297     -0.980      0.327        -0.873     0.291
        #-Age                 -0.0397      0.009     -4.464      0.000        -0.057    -0.022
        #-SibSp               -0.3202      0.136     -2.354      0.019        -0.587    -0.054
        #+Parch               -0.1420      0.136     -1.041      0.298        -0.409     0.125
        #====================================================================================
        # Pseudo R-squ. = 0.333 is good: values between 0.2 and 0.4 indicate a decent fit.
        # Note also that Embarked and Parch have P > 0.05, so they contribute little
        # predictive significance. Since we of course want few predictors, let's
        # redesign our formula and see what happens.

        # Let's update the formula, removing Embarked and Parch
        formula = 'Survived ~ C(Pclass) + C(Sex) + Age + SibSp '

        # Splitting the data into dependent and independent variables
        TdataY, TdataX = patsy.dmatrices(formula,
                                         data=DFTdataX,
                                         return_type='dataframe')
        VdataY, VdataX = patsy.dmatrices(formula,
                                         data=DFVdataX,
                                         return_type='dataframe')

        # Let's instantiate our model using the stats package
        LogistModel = sm.Logit(TdataY, TdataX)

        # Fit the model
        ResLogModel = LogistModel.fit()

        #print ResLogModel.summary()

        #Logit Regression Results
        #==============================================================================
        #Dep. Variable:               Survived   No. Observations:                  600
        #Model:                          Logit   Df Residuals:                      594
        #Method:                           MLE   Df Model:                            5
        #Date:                Sun, 15 Nov 2015   Pseudo R-squ.:                  0.3307
        #Time:                        12:26:13   Log-Likelihood:                -271.08
        #converged:                       True   LL-Null:                       -404.99
        #                                        LLR p-value:                 8.172e-56
        #==================================================================================
        #                     coef    std err          z      P>|z|      [95.0% Conf. Int.]
        #----------------------------------------------------------------------------------
        #-Intercept          4.1050      0.479      8.575      0.000         3.167     5.043
        #-C(Pclass)[T.2]    -1.2971      0.306     -4.242      0.000        -1.896    -0.698
        #-C(Pclass)[T.3]    -2.5739      0.305     -8.433      0.000        -3.172    -1.976
        #-C(Sex)[T.male]    -2.5808      0.235    -10.996      0.000        -3.041    -2.121
        #-Age               -0.0401      0.009     -4.549      0.000        -0.057    -0.023
        #-SibSp             -0.3691      0.130     -2.840      0.005        -0.624    -0.114
        #==================================================================================
        # We can see that all the predictors are significant in the preceding model.

        # Let's evaluate the model and see how well it does on the validation/testing data

        # We will use Kernel Density Estimation
        kde_res = sm.nonparametric.KDEUnivariate(ResLogModel.predict())
        kde_res.fit()
        #plt.plot(kde_res.support,kde_res.density)
        #plt.fill_between(kde_res.support,kde_res.density, alpha=0.2)
        #plt.title("Distribution of our Predictions")
        #plt.show()
        # From the plot, most of the density (the distribution's peak) is near 0. That means
        # most people are predicted not to survive, which matches the Titanic dataset.

        # Let's see the prediction distribution based on the male gender:
        #plt.scatter(ResLogModel.predict(),TdataX['C(Sex)[T.male]'] , alpha=0.2)
        #plt.grid(b=True, which='major', axis='x')
        #plt.xlabel("Predicted chance of survival")
        #plt.ylabel("Male Gender")
        #plt.title("The Change of Survival Probability by Gender being Male")
        #plt.show()
        # As the plot shows, the probability of survival is higher for females compared to males.

        # Now, let's see the distribution of the prediction based on the lower class of the passengers:
        #plt.scatter(ResLogModel.predict(),TdataX['C(Pclass)[T.3]'] , alpha=0.2)
        #plt.xlabel("Predicted chance of survival")
        #plt.ylabel("Class Bool") # Boolean class to show if its 3rd class
        #plt.grid(b=True, which='major', axis='x')
        #plt.title("The Change of Survival Probability by Lower Class which is 3rd class")
        #plt.show()
        # As the plot shows, lower-class passengers have a lower chance of survival compared
        # to the upper class. More money could save you...

        # Let's see the distribution of the probability with respect to the age of the passengers:
        #plt.scatter(ResLogModel.predict(),TdataX.Age , alpha=0.2)
        #plt.grid(True, linewidth=0.15)
        #plt.title("The Change of Survival Probability by Age")
        #plt.xlabel("Predicted chance of survival")
        #plt.ylabel("Age")
        #plt.show()
        # The graph shows two patterns:
        # 1. Small children (around 0-1 years old) have predicted survival chances spread over the full range.
        # 2. As age increases, the predictions shift left, i.e. a lower chance of survival.
        # Unlike the two binary plots above, this one is spread over a wide range of ages.

        # Let's see the distribution of the probability with respect to the number of siblings/spouses:
        #plt.scatter(ResLogModel.predict(),TdataX.SibSp , alpha=0.2)
        #plt.grid(True, linewidth=0.15)
        #plt.title("The Change of Survival Probability by Number of siblings/spouses")
        #plt.xlabel("Predicted chance of survival")
        #plt.ylabel("No. of Siblings/Spouses")
        #plt.show()
        # The fewer family members on board, the higher the chance of survival.

        ## Evaluating a model based on test data ##
        y_pred = ResLogModel.predict(VdataX)
        y_pred_flag = y_pred > 0.7
        print('------------------------------------------------------------------------------------------')
        print(pd.crosstab(VdataY.Survived,
                          y_pred_flag,
                          rownames=['Actual'],
                          colnames=['Predicted']))
        print('------------------------------------------------------------------------------------------')
        print(classification_report(VdataY, y_pred_flag))
        print('------------------------------------------------------------------------------------------')
Example #33
    def predict(self, h=5, intervals=False, oos_data=None, **kwargs):        
        """ Makes forecast with the estimated model

        Parameters
        ----------
        h : int (default : 5)
            How many steps ahead would you like to forecast?

        intervals : boolean (default: False)
            Whether to return prediction intervals

        oos_data : pd.DataFrame
            Data for the variables to be used out of sample (ys can be NaNs)

        Returns
        ----------
        - pd.DataFrame with predictions
        """     

        nsims = kwargs.get('nsims', 200)

        if self.latent_variables.estimated is False:
            raise Exception("No latent variables estimated!")
        else:

            _, X_oos = dmatrices(self.formula, oos_data)
            X_oos = np.array([X_oos])[0]
            full_X = self.X.copy()
            full_X = np.append(full_X,X_oos,axis=0)
            Z = full_X
            date_index = self.shift_dates(h)

            # Retrieve data, dates and (transformed) latent variables   
            if self.latent_variables.estimation_method in ['M-H']:
                lower_1_final = 0
                upper_99_final = 0
                lower_5_final = 0
                upper_95_final = 0
                forecasted_values_final = 0

                for i in range(nsims):
                    t_params = self.draw_latent_variables(nsims=1).T[0]
                    a, P = self._forecast_model(t_params, Z, h)

                    smoothed_series = np.zeros(h)
                    series_variance = np.zeros(h)

                    for t in range(h):
                        smoothed_series[t] = np.dot(Z[self.y.shape[0]+t],a[:,self.y.shape[0]+t])
                        series_variance[t] = np.dot(np.dot(Z[self.y.shape[0]+t],P[:,:,self.y.shape[0]+t]),Z[self.y.shape[0]+t].T)

                    forecasted_values = smoothed_series
                    lower_5 = smoothed_series - 1.96*np.power(P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(t_params[0]),0.5)
                    upper_95 = smoothed_series + 1.96*np.power(P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(t_params[0]),0.5)
                    lower_5_final += lower_5
                    upper_95_final += upper_95
                    lower_1 = smoothed_series - 2.575*np.power(P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(t_params[0]),0.5)
                    upper_99 = smoothed_series + 2.575*np.power(P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(t_params[0]),0.5)
                    lower_1_final += lower_1
                    upper_99_final += upper_99
                    forecasted_values_final += forecasted_values

                forecasted_values_final = forecasted_values_final / nsims
                lower_1_final = lower_1_final / nsims
                lower_5_final = lower_5_final / nsims
                upper_95_final = upper_95_final / nsims
                upper_99_final = upper_99_final / nsims

                if intervals is False:
                    result = pd.DataFrame(forecasted_values_final)
                    result.rename(columns={0:self.data_name}, inplace=True)
                else:
                    prediction_05 = lower_5_final
                    prediction_95 = upper_95_final
                    prediction_01 = lower_1_final
                    prediction_99 = upper_99_final

                    result = pd.DataFrame([forecasted_values_final, prediction_01, prediction_05, 
                        prediction_95, prediction_99]).T
                    result.rename(columns={0:self.data_name, 1: "1% Prediction Interval", 
                        2: "5% Prediction Interval", 3: "95% Prediction Interval", 4: "99% Prediction Interval"}, 
                        inplace=True)

                result.index = date_index[-h:]

                return result
     
            else:
                t_params = self.latent_variables.get_z_values()
                a, P = self._forecast_model(t_params, Z, h)
                smoothed_series = np.zeros(h)

                for t in range(h):
                    smoothed_series[t] = np.dot(Z[self.y.shape[0]+t],a[:,self.y.shape[0]+t])

                # Retrieve data, dates and (transformed) latent variables         
                forecasted_values = smoothed_series

                if intervals is False:
                    result = pd.DataFrame(forecasted_values)
                    result.rename(columns={0:self.data_name}, inplace=True)
                else:

                    series_variance = np.zeros(h)

                    for t in range(h):
                        series_variance[t] = np.dot(np.dot(Z[self.y.shape[0]+t],P[:,:,self.y.shape[0]+t]),Z[self.y.shape[0]+t].T)

                    prediction_05 = forecasted_values - 1.96*np.power(P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(self.latent_variables.get_z_values()[0]),0.5)
                    prediction_95 = forecasted_values + 1.96*np.power(P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(self.latent_variables.get_z_values()[0]),0.5)
                    prediction_01 = forecasted_values - 2.575*np.power(P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(self.latent_variables.get_z_values()[0]),0.5)
                    prediction_99 = forecasted_values + 2.575*np.power(P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(self.latent_variables.get_z_values()[0]),0.5)

                    result = pd.DataFrame([forecasted_values, prediction_01, prediction_05, 
                        prediction_95, prediction_99]).T
                    result.rename(columns={0:self.data_name, 1: "1% Prediction Interval", 
                        2: "5% Prediction Interval", 3: "95% Prediction Interval", 4: "99% Prediction Interval"}, 
                        inplace=True)
     
                result.index = date_index[-h:]

                return result
Example #34
    def plot_predict(self, h=5, past_values=20, intervals=True, oos_data=None, **kwargs):        
        """ Makes forecast with the estimated model

        Parameters
        ----------
        h : int (default : 5)
            How many steps ahead would you like to forecast?

        past_values : int (default : 20)
            How many past observations to show on the forecast graph?

        intervals : Boolean
            Would you like to show 95% prediction intervals for the forecast?

        oos_data : pd.DataFrame
            Data for the variables to be used out of sample (ys can be NaNs)

        Returns
        ----------
        - Plot of the forecast
        """
        import matplotlib.pyplot as plt
        import seaborn as sns

        figsize = kwargs.get('figsize',(10,7))
        nsims = kwargs.get('nsims', 200)

        if self.latent_variables.estimated is False:
            raise Exception("No latent variables estimated!")
        else:

            _, X_oos = dmatrices(self.formula, oos_data)
            X_oos = np.array([X_oos])[0]
            full_X = self.X.copy()
            full_X = np.append(full_X,X_oos,axis=0)
            Z = full_X
            date_index = self.shift_dates(h)

            # Retrieve data, dates and (transformed) latent variables   
            if self.latent_variables.estimation_method in ['M-H']:
                lower_final = 0
                upper_final = 0
                plot_values_final = 0
                plot_index = date_index[-h-past_values:]

                for i in range(nsims):

                    t_params = self.draw_latent_variables(nsims=1).T[0]
                    a, P = self._forecast_model(t_params, Z, h)

                    smoothed_series = np.zeros(self.y.shape[0]+h)
                    series_variance = np.zeros(self.y.shape[0]+h)

                    for t in range(self.y.shape[0]+h):
                        smoothed_series[t] = np.dot(Z[t],a[:,t])
                        series_variance[t] = np.dot(np.dot(Z[t],P[:,:,t]),Z[t].T)

                    plot_values = smoothed_series[-h-past_values:]
                    lower = smoothed_series[-h:] - 1.96*np.power(P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(t_params[0]),0.5)
                    upper = smoothed_series[-h:] + 1.96*np.power(P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(t_params[0]),0.5)
                    lower_final += np.append(plot_values[-h-1], lower)
                    upper_final += np.append(plot_values[-h-1], upper)
                    plot_values_final += plot_values

                plot_values_final = plot_values_final / nsims
                lower_final = lower_final / nsims
                upper_final = upper_final / nsims

                plt.figure(figsize=figsize)
                if intervals == True:
                    plt.fill_between(date_index[-h-1:], lower_final, upper_final, alpha=0.2)            

                plt.plot(plot_index, plot_values_final)
                plt.title("Forecast for " + self.data_name)
                plt.xlabel("Time")
                plt.ylabel(self.data_name)
                plt.show()
            else:
                a, P = self._forecast_model(self.latent_variables.get_z_values(), Z, h)
                plot_values = a[0][-h-past_values:]
                forecasted_values = a[0][-h:]

                smoothed_series = np.zeros(self.y.shape[0]+h)
                series_variance = np.zeros(self.y.shape[0]+h)

                for t in range(self.y.shape[0]+h):
                    smoothed_series[t] = np.dot(Z[t],a[:,t])
                    series_variance[t] = np.dot(np.dot(Z[t],P[:,:,t]),Z[t].T)

                lower = forecasted_values - 1.96*np.power(P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(self.latent_variables.get_z_values()[0]),0.5)
                upper = forecasted_values + 1.96*np.power(P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(self.latent_variables.get_z_values()[0]),0.5)
                lower = np.append(plot_values[-h-1],lower)
                upper = np.append(plot_values[-h-1],upper)

                plot_index = date_index[-h-past_values:]

                plt.figure(figsize=figsize)
                if intervals == True:
                    plt.fill_between(date_index[-h-1:], lower, upper, alpha=0.2)            

                plt.plot(plot_index,plot_values)
                plt.title("Forecast for " + self.data_name)
                plt.xlabel("Time")
                plt.ylabel(self.data_name)
                plt.show()
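A minimal usage sketch for the method above, in the pyflux style; the module alias pf, the DynReg class name, and the column names are assumptions, not taken from the source:

import numpy as np
import pandas as pd
import pyflux as pf

# in-sample frame, plus an out-of-sample frame carrying future regressors (ys left as NaN)
df = pd.DataFrame({"y": np.random.randn(100).cumsum(), "x1": np.random.randn(100)})
oos = pd.DataFrame({"y": [np.nan] * 5, "x1": np.random.randn(5)})

model = pf.DynReg("y ~ x1", data=df)  # hypothetical dynamic-regression class exposing plot_predict
model.fit()
model.plot_predict(h=5, past_values=20, intervals=True, oos_data=oos)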
foo = pd.read_csv("./data_vectorised/all_predictors_improved.csv")
foo['RESULT'] = pd.Series(data['type'], index=foo.index)
foo['ID'] = pd.Series(data['id'], index=foo.index)
foo.to_csv('./data_vectorised/reducedVectorised.csv',sep=',', index=False)



# # Logistic Regression

# In[55]:

data2 = pd.read_csv("./data_vectorised/reducedVectorised.csv")


# In[57]:

y, X = dmatrices("RESULT ~ flu + gett + im + shot + think + have + sick + feel + am + you + got + bett + worried + hope + today + vaccine + scared + week + has + back + home + might + worse + year + fev + she + already + try + they + bed + bug + symptom + dr + bit + care + weekend + hand + stomach + rest + old + hell + health + suck + us", data2, return_type = "dataframe")
# flatten y into a 1-D array
y = np.ravel(y)

# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(X,y)

# check the accuracy on the training set
model.score(X, y)
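Training accuracy alone is optimistic; a held-out split gives a fairer estimate. A minimal sketch, assuming the X and y built above and a modern scikit-learn (sklearn.model_selection):

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
holdout_model = LogisticRegression().fit(X_train, y_train)
print(metrics.accuracy_score(y_test, holdout_model.predict(X_test)))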


# In[ ]:
Example No. 36
0
                      alpha=0.7)

    # Label the silhouette plots with their cluster numbers at the middle
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # Compute the new y_lower for next plot
    y_lower = y_upper + 10  # 10 for the 0 samples
    plt.show()

################ Linear Regression ################

#target Y = rating
#remove calories as it is a linear combination of everything else

#y, x = patsy.dmatrices("rating~protein+fat+sodium+fiber+carbo+sugars+potass+C(vitamins)+C(shelf)+cups+C(mfr)+C(cluster)",cereal)
y, x = patsy.dmatrices("rating~protein+fat+sodium+fiber+carbo+sugars+potass",
                       cereal)

pca_regression = pca_cereals.copy()

pca_regression.rename({
    0: "PC1",
    1: "PC2",
    2: "PC3",
    3: "PC4"
},
                      axis=1,
                      inplace=True)

pca_regression["rating"] = cereal["rating"]

y, x = patsy.dmatrices("rating~protein+fat+sodium+fiber+carbo+sugars+potass",
Example No. 37
0
    def plot_predict(self,
                     h=5,
                     past_values=20,
                     intervals=True,
                     oos_data=None,
                     **kwargs):
        """ Plots forecast with the estimated model

        Parameters
        ----------
        h : int (default : 5)
            How many steps ahead would you like to forecast?

        past_values : int (default : 20)
            How many past observations to show on the forecast graph?

        intervals : Boolean
            Would you like to show prediction intervals for the forecast?

        oos_data : pd.DataFrame
            Data for the variables to be used out of sample (ys can be NaNs)

        Returns
        ----------
        - Plot of the forecast
        """
        import matplotlib.pyplot as plt

        figsize = kwargs.get('figsize', (10, 7))

        if self.latent_variables.estimated is False:
            raise Exception("No latent variables estimated!")
        else:

            # Retrieve data, dates and (transformed) latent variables
            _, X_oos = dmatrices(self.formula, oos_data)
            X_oos = np.array([X_oos])[0]
            X_pred = X_oos[:h]
            lmda, Y, scores, theta = self._model(
                self.latent_variables.get_z_values())
            date_index = self.shift_dates(h)
            t_params = self.transform_z()

            # Get mean prediction and simulations (for errors)
            mean_values = self._mean_prediction(lmda, Y, scores, h, t_params,
                                                X_pred)
            sim_values = self._sim_prediction(lmda, Y, scores, h, t_params,
                                              15000, X_pred)
            error_bars, forecasted_values, plot_values, plot_index = self._summarize_simulations(
                mean_values, sim_values, date_index, h, past_values)

            plt.figure(figsize=figsize)
            if intervals == True:
                alpha = [0.15 * i / float(100) for i in range(50, 12, -2)]
                for count, pre in enumerate(error_bars):
                    plt.fill_between(date_index[-h - 1:],
                                     np.exp((forecasted_values - pre) / 2),
                                     np.exp((forecasted_values + pre) / 2),
                                     alpha=alpha[count])

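            # exp(lambda/2) maps the latent log-variance to the volatility (standard-deviation) scale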
            plt.plot(plot_index, np.exp(plot_values / 2.0))
            plt.title("Forecast for " + self.data_name +
                      " Conditional Volatility")
            plt.xlabel("Time")
            plt.ylabel(self.data_name)
            plt.show()
Example No. 38
0
def rformula(df, formula):
    """
    Split a data frame into X and y based on an R Formula.

    Based on patsy formulas. See
    https://patsy.readthedocs.io/en/latest/formulas.html for valid formulas.

    Returns
    -------

    A tuple where the first element is a pandas DataFrame containing the
    independent variables, and the second is a pandas DataFrame containing
    the dependent variable.

    Example
    -------

    >>> df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9]))
    >>> X, y = df.rformula('a ~ b')
    >>> X
       b
    0  4.0
    1  5.0
    2  6.0
    >>> y
         a
    0  1.0
    1  2.0
    2  3.0
    >>> X, y = df.rformula('c ~ a + b')
    >>> X
       a    b
    0  1.0  4.0
    1  2.0  5.0
    2  3.0  6.0
    >>> y
         c
    0  7.0
    1  8.0
    2  9.0
    >>> X, y = df.rformula('b ~ a + a:c')
    >>> X
         a   a:c
    0  1.0   7.0
    1  2.0  16.0
    2  3.0  27.0
    >>> y
         b
    0  4.0
    1  5.0
    2  6.0
    >>> X, y = df.rformula('b ~ a*c')
    >>> X
         a    c   a:c
    0  1.0  7.0   7.0
    1  2.0  8.0  16.0
    2  3.0  9.0  27.0
    >>> y
         b
    0  4.0
    1  5.0
    2  6.0
    """
    y, X = patsy.dmatrices(formula, df, return_type="dataframe")
    return X.drop(columns="Intercept"), y
plt.show()

# Look at the survival outcomes for the lowest passenger class

# In[ ]:

lowclass = data.Survived[data.Pclass == 3].value_counts().sort_index()
lowclass.plot(kind='bar', label='Lowclass', color='Blue', alpha=0.6)
plt.show()

# dmatrices turns the categorical variables in the data into dummy variables, and specifies that Pclass, Sex, and Embarked are used to predict Survived

# In[ ]:

y, X = dmatrices('Survived~ C(Pclass) + C(Sex) + Age + C(Embarked)',
                 data=data,
                 return_type='dataframe')
y = np.ravel(y)

# In[ ]:

model = LogisticRegression()

# In[ ]:

model.fit(X, y)

# Print the training accuracy

# In[ ]:
Example No. 40
0
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score

#loading data
dta = sm.datasets.fair.load_pandas().data

# add "affair" column: 1 represents having affairs, 0 represents not
dta['affair'] = (dta.affairs > 0).astype(int)

#Prepare Data for Logistic Regression
#To prepare the data, I want to add an intercept column as well as dummy variables for occupation
# and occupation_husb, since I'm treating them as categorial variables.
#The dmatrices function from the patsy module can do that using formula language.
y, X = dmatrices('affair ~ rate_marriage + age + yrs_married + children + \
religious + educ + C(occupation) + C(occupation_husb)',
                 dta,
                 return_type="dataframe")

#rename the columns
X = X.rename(
    columns={
        'C(occupation)[T.2.0]': 'occ_2',
        'C(occupation)[T.3.0]': 'occ_3',
        'C(occupation)[T.4.0]': 'occ_4',
        'C(occupation)[T.5.0]': 'occ_5',
        'C(occupation)[T.6.0]': 'occ_6',
        'C(occupation_husb)[T.2.0]': 'occ_husb_2',
        'C(occupation_husb)[T.3.0]': 'occ_husb_3',
        'C(occupation_husb)[T.4.0]': 'occ_husb_4',
        'C(occupation_husb)[T.5.0]': 'occ_husb_5',
        'C(occupation_husb)[T.6.0]': 'occ_husb_6'
    })
Example No. 41
0
def tabulate_march_inequality(year):
    """
    #
    For years 1964-2009 (year is March year, not earnings year), tabulate:

    These inequality metrics:

    - 90/50, 50/10, 90/10, Vln
    - 60/50, 70/50, 80/50, 95/50, 97/50
    - 50/3, 50/5, 50/20, 50/30, 50/40

    For these samples

    - Males
    - Females
    - Both

    For these wage measures

    - All hourly

    For these conditioning variables

    - raw wage inequality
    - residual wage inequality

    Also note:

    - Always dropping allocators where possible

    D. Autor, 2/24/2004
    D. Autor, 6/15/2004 - Updated for consistency of controls for quantile simulation methods
    M. Anderson, 12/13/2005 - Updated for new quantiles and years
    D. Autor, 9/5/2006. Updated for 2005 March
    M. Wasserman, 10/14/2009 Updated for 2007/8 March
    #
    """

    df = tabulate_march_basic(year)
    df = df.eval("""
            lnwinc = log(winc_ws) + log(gdp)
            lnhinc = log(hinc_ws) + log(gdp)
        """)

    # Full-time and hourly samples
    df = df.eval("ftfy = fulltime*fullyear")
    df.ftfy.describe().to_frame().T
    df = df.eval("""
            ftsamp = (lnwinc == lnwinc) * ftfy * abs(bcwkwgkm-1)
            hrsamp = (lnhinc == lnhinc) * abs(bchrwgkm-1)
        """)
    # @ ftsamp: weekly real wage not none + ftfy + above weekly real wage limit
    # @ hrsamp: hourly real wage not none + above hourly real wage limit

    df.loc[df.ftsamp == 0, "lnwinc"] = np.nan
    df.loc[df.hrsamp == 0, "lnhinc"] = np.nan
    df.query("ftsamp == 1")["lnwinc"].describe().to_frame().T
    df.query("hrsamp == 1")["lnhinc"].describe().to_frame().T
    df = df.query("ftsamp == 1 | hrsamp == 1")

    # Generate experience categories
    df = df.assign(expcat=(df.exp/3).astype(int) + 1)
    df.loc[df.expcat == 17, "expcat"] = 16
    assert df.eval("1<= expcat <= 16").all()

    df.groupby("expcat")["exp"].agg(["mean", "min", "max"])

    # interaction terms - 80 of these
    # @ move to residual wage part

    # Drop reference group's interaction term: HSG with 0-2 years of experience
    # @ similarly skip now

    df = df.filter(["year", "wgt", "wgt_hrs", "female", "lnwinc", "lnhinc", "hrsamp", "ftsamp", "edcat", "expcat"])

    ######################################################################
    # Summarize raw inequality
    ######################################################################

    pctiles = pd.Series([3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 97])
    pctiles_ = pctiles / 100
    tot_pct = pd.DataFrame(index=pctiles)
    tot_stat = pd.DataFrame(index=["mn", "vln"])

    dt = df.query("ftsamp==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_mf"] = [wq.mean, wq.var]

    dt = df.query("ftsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_m"] = [wq.mean, wq.var]

    dt = df.query("ftsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_f"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_mf"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_m"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_m"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_f"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_f"] = [wq.mean, wq.var]

    df_stat = pd.concat([tot_stat, tot_pct], axis=0, sort=False)

    ######################################################################
    # Summarize residual inequality - Weekly & Hourly
    ######################################################################

    res_pct = pd.DataFrame(index=pctiles)
    res_stat = pd.DataFrame(index=["mn", "vln"])

    dt = df.query("ftsamp==1")
    y, X = dmatrices('lnwinc ~ female + C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_mf"] = [wq.mean, wq.var]  # @ mean is not necessary but to be consistent
    res_pct["res_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==0")
    y, X = dmatrices('lnwinc ~ C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_m"] = [wq.mean, wq.var]
    res_pct["res_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==1")
    y, X = dmatrices('lnwinc ~ C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_f"] = [wq.mean, wq.var]
    res_pct["res_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1")
    y, X = dmatrices('lnhinc ~ female + C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_mf"] = [wq.mean, wq.var]
    res_pct["res_hr_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1 & female==0")
    y, X = dmatrices('lnhinc ~ C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_m"] = [wq.mean, wq.var]
    res_pct["res_hr_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1 & female==1")
    y, X = dmatrices('lnhinc ~ C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_f"] = [wq.mean, wq.var]
    res_pct["res_hr_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    df_stat_ = pd.concat([res_stat, res_pct], axis=0)
    df_stat = pd.concat([df_stat, df_stat_], axis=1)

    # march-ineq-data-`1'
    df_stat = df_stat.T.rename_axis('sample').reset_index().assign(year=year)  # @ tidy data

    ######################################################################
    # Percentiles of weekly earnings
    ######################################################################

    # @ simply generate more percentiles under full-time samples
    # @ note here year is march census year thus minus one to be earnings year

    pctiles = pd.Series(range(3, 98))
    pctiles_ = pctiles / 100
    tot_pct = pd.DataFrame(index=pctiles)

    dt = df.query("ftsamp==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    # march-pctile-`yr'
    tot_pct = tot_pct.T.rename_axis('sample').reset_index().assign(year=year-1)  # @ tidy data

    # @ the code then combine 1963-2008 generated files
    # @ we remove this as not sure necessary
    # @ actually this part can be combined with #Summarize raw inequality#

    return df_stat, tot_pct
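The weighted percentile and variance pattern repeated above relies on statsmodels' DescrStatsW; a minimal standalone sketch with made-up data and weights:

import numpy as np
import pandas as pd
from statsmodels.stats.weightstats import DescrStatsW

# toy log-wage data with survey weights (illustrative values only)
wages = pd.DataFrame({"lnw": np.log([300.0, 450.0, 600.0, 900.0, 1500.0]),
                      "wgt": [1.0, 2.0, 1.5, 1.0, 0.5]})

wq = DescrStatsW(data=wages.lnw, weights=wages.wgt)
print(wq.mean, wq.var)                                          # weighted mean and variance
print(wq.quantile(probs=[0.1, 0.5, 0.9], return_pandas=False))  # weighted percentiles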
Example No. 42
0
 def __init__(self, formula, data):
     family = Gaussian()
     smooth_info = parse_smooths(formula, data)
     formula = get_parametric_formula(formula)
     y, Xp = patsy.dmatrices(formula,
                             data,
                             return_type='dataframe',
                             eval_env=1)
     varnames = Xp.columns.tolist()
     smooths = {}
     start = p = Xp.shape[1]
     ns = 0
     for key, val in smooth_info.items():
         slist = get_smooth(**val)
         if len(slist) == 1:
             smooths[key], = slist
             p_i = smooths[key]['X'].shape[1]
             varnames += [f"{key}{j}" for j in range(1, p_i + 1)]
             p += p_i
             ns += 1
         else:
             for i, x in enumerate(slist):
                 by_key = f"{key}_{x['by_cat']}"
                 smooths[by_key] = x
                 p_i = x['X'].shape[1]
                 varnames += [f"{by_key}_{j}" for j in range(1, p_i + 1)]
                 p += p_i
                 ns += 1
     X, S, Sj, ranks, ldS = [Xp], np.zeros((ns, p, p)), [], [], []
     for i, (var, s) in enumerate(smooths.items()):
         p_i = s['X'].shape[1]
         Si, ix = np.zeros((p, p)), np.arange(start, start + p_i)
         start += p_i
         Si[ix, ix.reshape(-1, 1)] = s['S']
         smooths[var]['ix'], smooths[var]['Si'] = ix, Si
         X.append(smooths[var]['X'])
         S[i] = Si
         Sj.append(s['S'])
         ranks.append(np.linalg.matrix_rank(Si))
         u = np.linalg.eigvals(s['S'])
         ldS.append(np.log(u[u > np.finfo(float).eps]).sum())
     self.X, self.Xp, self.y = np.concatenate(
         X, axis=1), Xp.values, y.values[:, 0]
     self.S, self.Sj, self.ranks, self.ldS = S, Sj, ranks, ldS
     self.f, self.smooths = family, smooths
     self.ns, self.n_obs, self.nx = ns, self.X.shape[0], self.X.shape[1]
     self.mp = self.nx - np.sum(self.ranks)
     self.data = data
     theta = np.zeros(self.ns + 1)
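      # heuristic start values: set each log-smoothing parameter from the ratio of diag(X'X) to the penalty diagonal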
     for i, (var, s) in enumerate(smooths.items()):
         ix = smooths[var]['ix']
         a = self.S[i][ix, ix[:, None].T]
         d = np.diag(self.X[:, ix].T.dot(self.X[:, ix]))
         lam = (1.5 * (d / a)[a > 0]).mean()
         theta[i] = np.log(lam)
         varnames += [f"log_smooth_{var}"]
     theta[-1] = 1.0
     varnames += ["log_scale"]
     self.theta = theta
     self.varnames = varnames
     self.smooth_info = smooth_info
Example No. 43
0
import statsmodels.formula.api as smf
from patsy import dmatrices
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import scale
from sklearn.linear_model import Lasso

df = pd.read_csv('Hitters.csv').dropna()

#remove unusable (categorical) columns; the result must be assigned back for the drop to take effect
df = df.drop(['League', 'Division'], axis=1)

#split data
train, test = np.split(df.sample(frac=1), [int(0.5 * len(df))])
formula = 'Salary~AtBat+Hits+HmRun+Runs+RBI+Walks+Years+CAtBat+CHits+CHmRun+CRuns+CRBI+CWalks+PutOuts+Assists+Errors'
y_train, x_train = dmatrices(formula, train, return_type='dataframe')
y_test, x_test = dmatrices(formula, test, return_type='dataframe')

#our two methods
ridge = Ridge()
lasso = Lasso(max_iter=10000)
alphas_selected = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

for a in alphas_selected:
    ridge.set_params(alpha=a)
    lasso.set_params(alpha=a)
    ridge.fit(scale(x_train), scale(y_train))
    lasso.fit(scale(x_train), scale(y_train))
    # predict on the same scaled features the models were fit on
    preds_ridge = ridge.predict(scale(x_train))
    preds_lasso = lasso.predict(scale(x_train))
    print('MSE RIDGE alpha=', a, ":",
Example No. 44
0
def vif(dt, y, x=None, merge_coef=False, positive="bad|1"):
    '''
    Variance Inflation Factors
    ------
    vif calculates variance-inflation factors for logistic regression.
    
    Params
    ------
    dt: A data frame with both x (predictor/feature) and y (response/label) variables.
    y: Name of y variable.
    x: Name of x variables. Default is None. If x is None, 
      then all variables except y are counted as x variables.
    merge_coef: Logical, whether to merge with coefficients of model summary matrix. Defaults to False.
    positive: Value of positive class, default "bad|1".
    
    Returns
    ------
    data frame
        A data frame with columns for variable and vif.
    
    Examples
    ------
    import scorecardpy as sc
    
    # load data
    dat = sc.germancredit()
    
    # Example I
    sc.vif(dat, 
        y = 'creditability', 
        x=['age_in_years', 'credit_amount', 'present_residence_since'], 
        merge_coef=True)
    '''

    dt = dt.copy(deep=True)
    if isinstance(y, str):
        y = [y]
    if isinstance(x, str) and x is not None:
        x = [x]
    if x is not None:
        dt = dt[y + x]
    # check y
    dt = check_y(dt, y, positive)
    # x variables
    x = x_variable(dt, y, x)

    # dty, dtx
    ytrain = dt.loc[:, y]
    Xtrain = dt.loc[:, x]
    Xtrain = sm.add_constant(Xtrain)

    # logistic regression
    lrfit = sm.GLM(ytrain.astype(float),
                   Xtrain.astype(float),
                   family=sm.families.Binomial()).fit()

    # vif
    dty, dtX = dmatrices(' ~ '.join([y[0], '+'.join(x)]),
                         data=dt,
                         return_type="dataframe")
    # rename patsy's intercept so it matches statsmodels' 'const' when merging
    dtX = dtX.rename(columns={'Intercept': 'const'})
    dfvif = pd.DataFrame({
        'variables': dtX.columns.tolist(),
        'vif': [
            variance_inflation_factor(dtX.values, i)
            for i in range(dtX.shape[1])
        ]
    })

    # merge with coef
    if merge_coef:
        dfvif = pd.merge(lrfit.summary2().tables[1].reset_index().rename(
            columns={'index': 'variables'}),
                         dfvif,
                         on='variables',
                         how='outer')
    return dfvif
Example No. 45
0
PCS = [1, 2, 3]
pdf = PdfPages(os.path.join(OUTPUT, "pc_clinic_associations.pdf"))
for target in targets:
    #target = 'MMSE'
    #target = 'TMTB'
    dt = data[data[target].notnull()]
    y = dt[target]
    fig, axarr = plt.subplots(1, 3)  #, sharey=True)
    fig.set_figwidth(15)
    print(fig.get_figwidth())
    for j, pc in enumerate(PCS):
        #j, pc = 2, 3
        # --------------------------------
        model = '%s~PC%s+AGE_AT_INCLUSION+SEX+EDUCATION+BPF+LLV' % (target, pc)
        # --------------------------------
        y, X = dmatrices(model, data=dt, return_type='dataframe')
        mod = sm.OLS(y, X).fit()
        test = mod.t_test([0, 1] + [0] * (X.shape[1] - 2))
        tval, pval = test.tvalue[0, 0], test.pvalue[0, 0]
        x = dt["PC%i" % pc]
        axarr[j].scatter(x, y)
        if False:
            for i in range(len(dt['Subject ID'])):
                axarr[j].text(dt["PC%i" % pc].iloc[i], y.iloc[i, 0],
                              dt['Subject ID'][i])
        x_ext = np.array([x.min(), x.max()])
        y_ext = x_ext * mod.params[1] + y.mean().values  #mod.params[0]
        axarr[j].plot(x_ext, y_ext, "red")
        if j == 0:
            axarr[j].set_ylabel(target)
        axarr[j].set_xlabel('PC%i (T=%.3f, P=%.4g)' % (pc, tval, pval))
Example No. 46
0
    def plot_predict(self,
                     h=5,
                     past_values=20,
                     intervals=True,
                     oos_data=None,
                     **kwargs):
        """ Makes forecast with the estimated model

        Parameters
        ----------
        h : int (default : 5)
            How many steps ahead would you like to forecast?

        past_values : int (default : 20)
            How many past observations to show on the forecast graph?

        intervals : Boolean
            Would you like to show 95% prediction intervals for the forecast?

        oos_data : pd.DataFrame
            Data for the variables to be used out of sample (ys can be NaNs)

        Returns
        ----------
        - Plot of the forecast
        """

        import matplotlib.pyplot as plt

        figsize = kwargs.get('figsize', (10, 7))

        if self.parameters.estimated is False:
            raise Exception("No parameters estimated!")
        else:
            # Sort/manipulate the out-of-sample data
            _, X_oos = dmatrices(self.formula, oos_data)
            X_oos = np.array([X_oos])[0]
            full_X = self.X.copy()
            full_X = np.append(full_X, X_oos, axis=0)
            Z = full_X

            # Retrieve data, dates and (transformed) parameters
            a, P = self._forecast_model(self.parameters.get_parameter_values(),
                                        Z, h)
            smoothed_series = np.zeros(self.y.shape[0] + h)
            series_variance = np.zeros(self.y.shape[0] + h)
            for t in range(self.y.shape[0] + h):
                smoothed_series[t] = np.dot(Z[t], a[:, t])
                series_variance[t] = np.dot(
                    np.dot(Z[t], P[:, :, t]), Z[t].T
                ) + self.parameters.parameter_list[0].prior.transform(
                    self.parameters.get_parameter_values()[0])

            date_index = self.shift_dates(h)
            plot_values = smoothed_series[-h - past_values:]
            forecasted_values = smoothed_series[-h:]
            lower = forecasted_values - 1.96 * np.power(
                series_variance[-h:], 0.5)
            upper = forecasted_values + 1.96 * np.power(
                series_variance[-h:], 0.5)
            lower = np.append(plot_values[-h - 1], lower)
            upper = np.append(plot_values[-h - 1], upper)

            plot_index = date_index[-h - past_values:]

            plt.figure(figsize=figsize)
            if intervals == True:
                plt.fill_between(date_index[-h - 1:], lower, upper, alpha=0.2)

            plt.plot(plot_index, plot_values)
            plt.title("Forecast for " + self.y_name)
            plt.xlabel("Time")
            plt.ylabel(self.y_name)
            plt.show()
Example No. 47
0
    def __init__(self, data, formula, ar, ma, integ=0, family=fam.Normal()):

        # Initialize TSM object
        super(ARIMAX,self).__init__('ARIMAX')

        # Latent Variables
        self.ar = ar
        self.ma = ma
        self.integ = integ
        self.z_no = self.ar + self.ma + 2
        self.max_lag = max(self.ar, self.ma)
        self._z_hide = 0 # Whether to cutoff latent variables from results table
        self.supported_methods = ["MLE", "PML", "Laplace", "M-H", "BBVI"]
        self.default_method = "MLE"
        self.multivariate_model = False

        # Format the data
        self.is_pandas = True # This is compulsory for this model type
        self.data_original = data.copy()
        self.formula = formula
        self.y, self.X = dmatrices(formula, data)
        self.y_name = self.y.design_info.describe()
        self.X_names = self.X.design_info.describe().split(" + ")
        self.y = self.y.astype(float)
        self.X = self.X.astype(float)
        self.z_no = self.X.shape[1]
        self.data_name = self.y_name
        self.y = np.array([self.y]).ravel()
        self.data = self.y.copy()
        self.X = np.array([self.X])[0]
        self.index = data.index

        # Difference data
        for order in range(0, self.integ):
            self.y = np.diff(self.y)
            self.data = np.diff(self.data)
            self.data_name = "Differenced " + self.data_name

        self.data_length = self.data.shape[0]
        self.ar_matrix = self._ar_matrix()
        self._create_latent_variables()

        self.family = family
        self.model_name2, self.link, self.scale, self.shape, self.skewness, self.mean_transform, self.cythonized = self.family.setup()
        self.model_name = self.model_name2 + " ARIMAX(" + str(self.ar) + "," + str(self.integ) + "," + str(self.ma) + ")"

        # Build any remaining latent variables that are specific to the family chosen
        for no, i in enumerate(self.family.build_latent_variables()):
            self.latent_variables.add_z(i[0], i[1], i[2])
            self.latent_variables.z_list[no+self.ar+self.ma+self.X.shape[1]].start = i[3]

        self.family_z_no = len(self.family.build_latent_variables())
        self.z_no = len(self.latent_variables.z_list)

        # If Normal family is selected, we use faster likelihood functions
        if isinstance(self.family, fam.Normal):
            self._model = self._normal_model
            self._mb_model = self._mb_normal_model
            self.neg_loglik = self.normal_neg_loglik
            self.mb_neg_loglik = self.normal_mb_neg_loglik
        else:
            self._model = self._non_normal_model
            self._mb_model = self._mb_non_normal_model
            self.neg_loglik = self.non_normal_neg_loglik
            self.mb_neg_loglik = self.non_normal_mb_neg_loglik
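A minimal usage sketch for this constructor, in the pyflux style; the alias pf and the column names are assumptions, not taken from the source:

import numpy as np
import pandas as pd
import pyflux as pf

# toy series with one exogenous regressor (illustrative data only)
df = pd.DataFrame({"sales": np.random.randn(100).cumsum(),
                   "promo": np.random.randint(0, 2, 100)})

model = pf.ARIMAX(data=df, formula="sales ~ promo", ar=1, ma=1, integ=0)
results = model.fit("MLE")  # the default estimation method, per default_method above
results.summary()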
Example No. 48
0
    # ['scale(np.log(active_methods +0.5))'],
    # ['scale(np.log(query_commits+0.5))'],
]

params = []
pvalues = []
prsquared = []
factors = []
vifmax = 0.0
corrmax = 0.0

mdls = []

for add in models:
    factors = factors + add
    y, X = patsy.dmatrices('is_sparql~ ' + ' + '.join(factors), data=repositories,
                           return_type='dataframe')

    mod = sm.OLS(y, X)
    res = mod.fit()
    mdls.append(res)
    print(res.summary())
    params.append(res.params)
    pvalues.append(res.pvalues)
    prsquared.append(res.rsquared)


    vif = pd.DataFrame()
    vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif["features"] = X.columns

    corr = X.corr()
Example No. 49
0
test_df = pd.read_csv(test_path)

mean_age = np.mean(train_df.Age)
train_df.Age = train_df.Age.fillna(mean_age)

test_df.Age = test_df.Age.fillna(mean_age)
test_df["Survived"] = 0
test_passengers = test_df.PassengerId.values

formula_ml = "Survived ~ C(Pclass) + C(Sex) + Age + SibSp + Parch + C(Embarked)"
#formula_ml = "Survived ~ C(Sex)"

results = {}


train_y, train_x = dmatrices(formula_ml, data=train_df, return_type="dataframe")
test_y, test_x = dmatrices(formula_ml, data=test_df, return_type="dataframe")

#train_y = np.asarray(train_y).ravel()

#Logistic Regression
model = sm.Logit(train_y, train_x)
res = model.fit()

output = res.predict(test_x)
output = np.asanyarray(output).ravel()
output = np.round(output)
output_file = open("myLogit2.csv", "w", newline="")
output_file_object = csv.writer(output_file)
output_file_object.writerow(["PassengerId", "Survived"])
output_file_object.writerows(zip(test_passengers, output))
Example No. 50
0
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from patsy import dmatrices
import statsmodels.api as sm

from sessionData import session

matplotlib.rcParams['pdf.fonttype'] = 42

d = import_data()
df = create_df(d)

maxResp = d['maxResponseWaitFrames'][()]

y, X = dmatrices('resp ~ rewDir + mask', data=df, return_type='dataframe')

mod = sm.OLS(y, X)

res = mod.fit()

res.summary()

#onehotencode the categorical variables (resp, dir))
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

#List of (name, transformer, column(s)) tuples
ct = ColumnTransformer([('encode1', OneHotEncoder(), ['rewDir']),
                        ('encode2', OneHotEncoder(), ['resp'])],
                       remainder='passthrough')
# Take differences between following month and the current month
monthly_final_df['difflanefrac'] = monthly_final_df['nextlanefrac'].subtract(
    monthly_final_df['lanefrac'])
monthly_final_df['diffcount'] = monthly_final_df['nextcount'].subtract(
    monthly_final_df['count'])
monthly_final_df['difftotaltrips'] = monthly_final_df[
    'nexttotaltrips'].subtract(monthly_final_df['totaltrips'])

pickle.dump(
    monthly_final_df,
    open(
        "C:/Users/fhp7/Desktop/Cornell/CEE 4620/Final Project/Model/monthly_final_df.p",
        'wb'))

#%% Perform regression on difference data
print("Performing Regression")
# Remove all records that have no infrastructure change
regress_df = monthly_final_df.loc[(monthly_final_df.difflanefrac != 0) & \
                                  (pd.notnull(monthly_final_df.difflanefrac))].copy()

regress_df['logdifflanefrac'] = regress_df['difflanefrac'].apply(np.log)

# Use patsy to generate design matrix and target vector
y, X = dmatrices('diffcount ~ difflanefrac + difftotaltrips',
                 data=regress_df,
                 return_type='dataframe')

# Fit the model using statsmodels and print the results
result = sm.OLS(y, X).fit()
print(result.summary())
########################################loading################################
#read data
traindf = pd.read_csv(train_file)
##clean data
df = clean_and_munge_data(traindf)

# ## Part 3: Creating a Random Forest Classifier with Cross Validation  ##

# In[ ]:

########################################formula################################

formula_ml = 'Survived~Pclass+C(Title)+Sex+C(AgeCat)+Fare_Per_Person+Fare+Family_Size'

y_train, x_train = dmatrices(formula_ml, data=df, return_type='dataframe')
y_train = np.asarray(y_train).ravel()

print(y_train.shape, x_train.shape)

##select a train and test set
X_train, X_test, Y_train, Y_test = train_test_split(x_train,
                                                    y_train,
                                                    test_size=0.2,
                                                    random_state=seed)

#instantiate and fit our model
clf = RandomForestClassifier(n_estimators=500,
                             criterion='entropy',
                             max_depth=5,
                             min_samples_split=2,
Example No. 53
0
def Problem_7():    
    df = pd.DataFrame(data = {'y': # Miles/gal
                              [18.90, 17.00, 20.00, 18.25,
                               20.07, 11.20, 22.12, 21.47,
                               34.70, 30.40, 16.50, 36.50,
                               21.50, 19.70, 20.30, 17.80,
                               14.39, 14.89, 17.80, 16.41,
                               23.54, 21.47, 16.59, 31.90,
                               29.40, 13.27, 23.90, 19.73,
                               13.90, 13.27, 13.77, 16.50],
                              'X1': # Displacement (in^3)
                              [350, 350, 250, 351,
                               225, 440, 231, 262,
                               89.7, 96.9, 350, 85.3,
                               171, 258, 140, 302,
                               500, 440, 350, 318,
                               231, 360, 400, 96.9,
                               140, 460, 133.6, 318,
                               351, 351, 360, 350],
                              'X2': # Weight (lbs)
                              [3910, 3860, 3510, 3890,
                               3365, 4215, 3020, 3180,
                               1905, 2320, 3885, 2009,
                               2655, 3375, 2700, 3890,
                               5290, 5185, 3910, 3660,
                               3050, 4250, 3850, 2275,
                               2150, 5430, 2535, 4370,
                               4540, 4715, 4215, 3660]},
                      index = ['Apollo', 'Omega', 'Nova', 'Monarch',
                               'Duster', 'Jenson Conv.', 'Skyhawk', 'Monza',
                               'Scirocco', 'Corolla SR-5', 'Camaro',
                               'Datsun B210', 'Capri II', 'Pacer', 'Bobcat',
                               'Granada', 'Eldorado', 'Imperial', 'Nova LN',
                               'Valiant', 'Starfire', 'Cordoba', 'Trans Am',
                               'Corolla E-5', 'Astre', 'Mark IV', 'Celica GT',
                               'Charger SE', 'Cougar', 'Elite', 'Matador',
                               'Corvette'])
    
    ###############
    # Problem 7.a #
    ###############
    title_print('Problem 7.a')
    
    y, X = patsy.dmatrices('y ~ X1 + X2', df)
    model = sm.OLS(y, X)
    results = model.fit()
    results.model.data.design_info = X.design_info
    
    print('> y = {} + {} * x1 + {} * x2 + e <'.format(
          round(results.params[0], 3),
          round(results.params[1], 3),
          round(results.params[2], 3)).center(80, '-'))

    ###############
    # Problem 7.b #
    ###############
    title_print('Problem 7.b')

    aov_table = sm.stats.anova_lm(results, typ = 1)
    print('\n--- Analysis of Variance table ---\n{}'.format(aov_table))
    print('\nRegression F: {}'.format(round(results.fvalue, 2)))
    print('Regression p: {}\n'.format(round(results.f_pvalue, 4)))
    print('> Based on P-values, X1 is significant, X2 is not <'.\
          center(80, '-'))

    ###############
    # Problem 7.c #
    ###############
    title_print('Problem 7.c')
    
    print('> R-squared explains {}% of total variability <'.\
          format(round(results.rsquared * 100, 2)).center(80, '-'))

    ###############
    # Problem 7.d #
    ###############
    title_print('Problem 7.d')

    conf_int = np.round(results.conf_int(), 5)
    
    print('> 95% Confidence Intervals <'.center(80, '-'))
    print('> Intercept: {} <'.format(conf_int[0]).center(80, '-'))
    print('> B1: {} <'.format(conf_int[1]).center(80, '-'))
    print('> B2: {} <'.format(conf_int[2]).center(80, '-'))
    print('> 95% confident respective slopes are between these values <'.\
          center(80, '-'))

    ###############
    # Problem 7.e #
    ###############
    title_print('Problem 7.e')

    intervals = np.round(results.get_prediction([1, 275, 3000]).\
                         summary_frame(alpha = 0.05), 4)

    print('> 95% Confidence Interval <'.center(80, '-'))
    print('> {} to {} <'.format(intervals['mean_ci_lower'].values,
                                intervals['mean_ci_upper'].values).\
                         center(80, '-'))
    print('> 95% confident interval contains true mean <'.center(80, '-'))
    
    ###############
    # Problem 7.f #
    ###############
    title_print('Problem 7.f')
    
    print('> 95% Prediction Interval <'.center(80, '-'))
    print('> {} to {} <'.format(intervals['obs_ci_lower'].values,
                                intervals['obs_ci_upper'].values).\
                         center(80, '-'))
    print('> 95% confident interval contains prediction <'.center(80, '-'))

    ###############
    # Problem 7.g #
    ###############
    title_print('Problem 7.g')
    
    print('> Prediction interval is wider <'.center(80, '-'))
    print('> More uncertainty when making single/specific prediction <'.\
          center(80, '-'))
    
    #################
    # Problem 7.h.1 #
    #################
    title_print('Problem 7.h.1')
    
    residuals = results.resid
    prob = [(i + 0.5) / len(y) for i in range(len(y))]  # plotting positions (i - 1/2)/n for i = 1..n
    
    # Can plot straight line for visuals
    resid_results = sm.OLS(prob, sm.add_constant(sorted(residuals))).fit()
    X_range = np.linspace(min(residuals), max(residuals), len(residuals))
    
    # Normal Probability Plot + straight line
    fig, ax = plt.subplots()
    ax.scatter(sorted(residuals), prob)
    ax.plot(X_range,
            resid_results.params[0] + resid_results.params[1] * X_range)
    ax.set_xlabel('Residual')
    ax.set_ylabel('Probability')
    ax.set_ylim(0, 1)
    plt.title('Normal Probability Plot')
    plt.show()
    
    print('> Does not appear to be problem with normality <'.center(80, '-'))

    #################
    # Problem 7.h.2 #
    #################
    title_print('Problem 7.h.2')
    
    fig, ax = plt.subplots()
    ax.scatter(results.fittedvalues, residuals)
    ax.axhline(0)
    ax.set_xlabel('Fitted Values')
    ax.set_ylabel('Residuals')
    plt.title('Residuals Versus Predicted Response')
    plt.show()
    
    print('> Definite non-linear pattern. Either slight downward trend <'.\
          center(80, '-'))
    print('> if you disregard 5 points in upper right. OR somewhat <'.\
          center(80, '-'))
    print('> quadratic if disregard 3 points in lower right <'.center(80, '-'))
    
    #################
    # Problem 7.h.3 #
    #################
    title_print('Problem 7.h.3')
    
    fig, ax = plt.subplots()
    ax2 = ax.twiny()
    scat_1 = ax.plot(df['X1'], residuals,
                     marker = '*', linestyle = '', color = 'orange', label = 'X1')
    scat_2 = ax2.plot(df['X2'], residuals,
                      marker = 'o', linestyle = '', color = 'black', label = 'X2')
    ax.axhline(0)
    ax.set_xlabel('X_1')
    ax2.set_xlabel('X_2')
    ax.set_ylabel('Residuals')
    
    plots = scat_1 + scat_2
    labels = [label.get_label() for label in plots]
    ax.legend(plots, labels, loc = 'lower right')
    plt.title('Residuals Versus X_i')
    plt.show()
    
    print('> One y value plotted for each X-value <'.center(80, '-'))
    print('> Non-linear pattern trends to upper right <'.center(80, '-'))

    return df, results
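For reference, scipy can draw the normal probability plot from 7.h.1 in one call; a minimal sketch, assuming the residuals computed above:

import matplotlib.pyplot as plt
from scipy import stats

fig, ax = plt.subplots()
stats.probplot(residuals, dist="norm", plot=ax)  # quantile-quantile plot against the normal
plt.show()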
Example No. 54
0
import numpy as np
import pandas as pd
import patsy as pt
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from datetime import datetime
import quandl

 
df = pd.read_csv(r"data_third.csv")
del df['Split Ratio']



df['Date'] = pd.to_datetime(df['Date'])
df['Date'] = df['Date'] - df['Date'][0]
df['Date'] = df['Date'].dt.days
dfn = (df - df.mean()) / (df.max() - df.min())#normalization
pt_y, pt_x = pt.dmatrices("Close ~ Open", dfn)
res = np.linalg.lstsq(pt_x, pt_y, rcond=None)
b0 = res[0].ravel()
print("Close ~ Open ", b0)

"""
ax = plt.subplot()
scatter_matrix(df, alpha=0.05, figsize=(10, 10), marker ='x')

ax.plot(dfn['Close'], dfn['Open'], 'go', color = 'blue')#x[x]
axis_x = np.linspace(-1, 1, 100)
f = b0[0] + b0[1] * axis_x 
ax.plot(axis_x, f, color = 'red')
"""
ax = plt.subplot()
plt.scatter(dfn['Date'], dfn['Close'], c='blue', s=20, label='blue',
Example No. 55
0
    def plot_predict(self, h=5, past_values=20, intervals=True, oos_data=None, **kwargs):
        """ Plots forecasts with the estimated model

        Parameters
        ----------
        h : int (default : 5)
            How many steps ahead would you like to forecast?

        past_values : int (default : 20)
            How many past observations to show on the forecast plot?

        intervals : Boolean
            Would you like to show prediction intervals for the forecast?

        oos_data : pd.DataFrame
            Data for the variables to be used out of sample (ys can be NaNs)

        Returns
        ----------
        - Plot of the forecast
        """
        import matplotlib.pyplot as plt
        import seaborn as sns

        figsize = kwargs.get('figsize',(10,7))

        if self.latent_variables.estimated is False:
            raise Exception("No latent variables estimated!")
        else:
            _, X_oos = dmatrices(self.formula, oos_data)
            X_oos = np.array([X_oos])[0]
            X_pred = X_oos[:h]

            # Retrieve data, dates and (transformed) latent variables
            mu, Y = self._model(self.latent_variables.get_z_values())         
            date_index = self.shift_dates(h)

            if self.latent_variables.estimation_method in ['M-H']:
                sim_vector = self._sim_prediction_bayes(h, X_pred, 15000)
                error_bars = []

                for pre in range(5,100,5):
                    error_bars.append(np.insert([np.percentile(i,pre) for i in sim_vector], 0, Y[-1]))

                forecasted_values = np.insert([np.mean(i) for i in sim_vector], 0, Y[-1])
                plot_values = np.append(Y[-1-past_values:-2], forecasted_values)
                plot_index = date_index[-h-past_values:]

            else:
                t_z = self.transform_z()
                mean_values = self._mean_prediction(mu, Y, h, t_z, X_pred)

                if self.model_name2 == "Skewt":
                    model_scale, model_shape, model_skewness = self._get_scale_and_shape(t_z)
                    m1 = (np.sqrt(model_shape)*sp.gamma((model_shape-1.0)/2.0))/(np.sqrt(np.pi)*sp.gamma(model_shape/2.0))
                    forecasted_values = mean_values[-h:] + (model_skewness - (1.0/model_skewness))*model_scale*m1 
                else:
                    forecasted_values = mean_values[-h:] 

                if intervals is True:
                    sim_values = self._sim_prediction(mu, Y, h, t_z, X_pred, 15000)
                else:
                    sim_values = self._sim_prediction(mu, Y, h, t_z, X_pred, 2)

                error_bars, forecasted_values, plot_values, plot_index = self._summarize_simulations(mean_values, sim_values, date_index, h, past_values)

            plt.figure(figsize=figsize)
            if intervals == True:
                alpha =[0.15*i/float(100) for i in range(50,12,-2)]
                for count, pre in enumerate(error_bars):
                    plt.fill_between(date_index[-h-1:], error_bars[count], error_bars[-count-1],alpha=alpha[count])             
            plt.plot(plot_index,plot_values)
            plt.title("Forecast for " + self.data_name)
            plt.xlabel("Time")
            plt.ylabel(self.data_name)
            plt.show()
Example No. 56
0
def _epoch_spans(recspan_intern_table, data_set, rerp_specs, eval_env):
    rerp_infos = []
    rerp_names = set()
    spans = []
    design_offset = 0
    expanded_design_offset = 0
    data_format = data_set.data_format
    for rerp_idx, rerp_spec in enumerate(rerp_specs):
        start_offset = data_format.ms_to_samples(rerp_spec.start_time)
        # Offsets are half open: [start, stop)
        # But, it's more intuitive for times to be closed: [start, stop]
        # So we interpret the user times as a closed interval, and add 1
        # sample when converting to offsets.
        stop_offset = 1 + data_format.ms_to_samples(rerp_spec.stop_time)
        if start_offset >= stop_offset:
            raise ValueError("Epochs must be >1 sample long!")
        event_set = data_set.events.find(rerp_spec.event_query)
        # Tricky bit: the user specifies a RHS-only formula, but really we have an
        # implicit LHS (determined by the event_query). This makes things
        # complicated when it comes to e.g. keeping track of which items
        # survived NA removal, determining the number of rows in an
        # intercept-only formula, etc. Really we want patsy to just treat all
        # this stuff the same way as it normally handles a LHS~RHS
        # formula. So, we convert our RHS formula into a LHS~RHS formula,
        # using a special LHS that represents each event by a placeholder
        # integer!
        desc = ModelDesc.from_formula(rerp_spec.formula, eval_env)
        if desc.lhs_termlist:
            raise ValueError("Formula cannot have a left-hand side")
        desc.lhs_termlist = [Term([_ArangeFactor(len(event_set))])]
        fake_lhs, design = dmatrices(desc, event_set)
        surviving_event_idxes = np.asarray(fake_lhs, dtype=int).ravel()
        design_row_idxes = np.empty(len(event_set))
        design_row_idxes.fill(-1)
        design_row_idxes[surviving_event_idxes] = np.arange(design.shape[0])
        # Now design_row_idxes[i] is -1 if event i was thrown out, and
        # otherwise gives the row in 'design' which refers to event 'i'.
        for i in range(len(event_set)):
            event = event_set[i]
            # -1 for non-existent
            design_row_idx = design_row_idxes[i]
            recspan = (event.recording, event.span_id)
            recspan_intern = recspan_intern_table[recspan]
            epoch_start = start_offset + event.start_idx
            epoch_stop = stop_offset + event.start_idx
            if design_row_idx == -1:
                design_row = None
            else:
                design_row = design[design_row_idx, :]
            epoch = _Epoch(epoch_start, epoch_stop - epoch_start, design_row,
                           design_offset, expanded_design_offset, rerp_idx, [])
            if design_row is None:
                # Event thrown out due to missing predictors; this
                # makes its whole epoch into an artifact -- but if overlap
                # correction is disabled, then this artifact only affects
                # this epoch, not anything else. (We still want to treat
                # it as an artifact though so we get proper accounting at
                # the end.)
                epoch.intrinsic_artifacts.append("_MISSING_PREDICTOR")
            spans.append(
                DataSpan((recspan_intern, epoch_start),
                         (recspan_intern, epoch_stop), epoch, None))
        if rerp_spec.name in rerp_names:
            raise ValueError("name %r used for two different sub-analyses" %
                             (rerp_spec.name, ))
        rerp_names.add(rerp_spec.name)
        rerp_info = {
            "spec": rerp_spec,
            "design_info": design.design_info,
            "start_offset": start_offset,
            "stop_offset": stop_offset,
            "design_offset": design_offset,
            "expanded_design_offset": expanded_design_offset,
            "total_epochs": len(event_set),
            "epochs_with_data": 0,
            "epochs_with_artifacts": 0,
        }
        rerp_infos.append(rerp_info)
        design_offset += design.shape[1]
        epoch_samples = stop_offset - start_offset
        expanded_design_offset += epoch_samples * design.shape[1]

    return rerp_infos, spans, design_offset, expanded_design_offset
Example No. 57
0
    def predict(self, h=5, oos_data=None, intervals=False):
        """ Makes forecast with the estimated model

        Parameters
        ----------
        h : int (default : 5)
            How many steps ahead would you like to forecast?

        oos_data : pd.DataFrame
            Data for the variables to be used out of sample (ys can be NaNs)

        intervals : boolean (default: False)
            Whether to return prediction intervals

        Returns
        ----------
        - pd.DataFrame with predicted values
        """ 

        if self.latent_variables.estimated is False:
            raise Exception("No latent variables estimated!")
        else:
            _, X_oos = dmatrices(self.formula, oos_data)
            X_oos = np.array([X_oos])[0]
            X_pred = X_oos[:h]

            # Retrieve data, dates and (transformed) latent variables
            mu, Y = self._model(self.latent_variables.get_z_values())         
            date_index = self.shift_dates(h)

            if self.latent_variables.estimation_method in ['M-H']:
                sim_vector = self._sim_prediction_bayes(h, X_pred, 15000)

                forecasted_values = np.array([np.mean(i) for i in sim_vector])
                prediction_01 = np.array([np.percentile(i, 1) for i in sim_vector])
                prediction_05 = np.array([np.percentile(i, 5) for i in sim_vector])
                prediction_95 = np.array([np.percentile(i, 95) for i in sim_vector])
                prediction_99 = np.array([np.percentile(i, 99) for i in sim_vector])

            else:
                t_z = self.transform_z()
                mean_values = self._mean_prediction(mu, Y, h, t_z, X_pred)

                if self.model_name2 == "Skewt":
                    model_scale, model_shape, model_skewness = self._get_scale_and_shape(t_z)
                    m1 = (np.sqrt(model_shape)*sp.gamma((model_shape-1.0)/2.0))/(np.sqrt(np.pi)*sp.gamma(model_shape/2.0))
                    forecasted_values = mean_values[-h:] + (model_skewness - (1.0/model_skewness))*model_scale*m1 
                else:
                    forecasted_values = mean_values[-h:] 

                if intervals is True:
                    sim_values = self._sim_prediction(mu, Y, h, t_z, X_pred, 15000)
                else:
                    sim_values = self._sim_prediction(mu, Y, h, t_z, X_pred, 2)

            if intervals is False:
                result = pd.DataFrame(forecasted_values)
                result.rename(columns={0:self.data_name}, inplace=True)
            else:
                # Get mean prediction and simulations (for errors)
                if self.latent_variables.estimation_method not in ['M-H']:
                    sim_values = self._sim_prediction(mu, Y, h, t_z, X_pred, 15000)
                    prediction_01 = np.array([np.percentile(i, 1) for i in sim_values])
                    prediction_05 = np.array([np.percentile(i, 5) for i in sim_values])
                    prediction_95 = np.array([np.percentile(i, 95) for i in sim_values])
                    prediction_99 = np.array([np.percentile(i, 99) for i in sim_values])

                result = pd.DataFrame([forecasted_values, prediction_01, prediction_05, 
                    prediction_95, prediction_99]).T
                result.rename(columns={0:self.data_name, 1: "1% Prediction Interval", 
                    2: "5% Prediction Interval", 3: "95% Prediction Interval", 4: "99% Prediction Interval"}, 
                    inplace=True)
 
            result.index = date_index[-h:]

            return result
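A minimal usage sketch for this method, in the pyflux style; the fitted model object and the column names in the out-of-sample frame are assumptions, not taken from the source:

import numpy as np
import pandas as pd

# out-of-sample frame: future exogenous values, with the response left as NaN
oos = pd.DataFrame({"y": [np.nan] * 5, "x1": [0.1, 0.0, 0.2, 0.1, 0.3]})

forecast = model.predict(h=5, oos_data=oos, intervals=True)
print(forecast)  # point forecasts plus 1%/5%/95%/99% prediction-interval columns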
Example No. 58
0
import seaborn as sb
sb.pairplot(df[['a','b','c']])             # draw pairwise plots to inspect the relationships between columns a, b and c


import statsmodels.api as sm

#basic fitting workflow
df['intercept'] = 1                        #add an intercept column
lm = sm.OLS(df['y'],df[['intercept','x']]) #set the dependent and independent variables (ordinary least squares)
results = lm.fit()                         #fit the model and store the results
results.summary()                          #view the summary
#convert to dummy variables
df[['A','B','C']] = pd.get_dummies(df['a'])#expand column a into dummy variables stored in new columns A, B, C
lm=sm.Logit(df['y'],df[['intercept','x']]) #logistic regression does not use ordinary least squares
results = lm.fit()                         #fit the model and store the results
results.summary2()                         #view the summary
#compute VIF values
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

y,x=dmatrices('price ~ area + bedrooms + bathrooms',df,return_type='dataframe')

vif=pd.DataFrame()
vif['VIF Factor']=[variance_inflation_factor(x.values,i) for i in range(x.shape[1])]
vif['features']=x.columns


import sklearn.preprocessing as p
p.scale(df['a'])                           # scaled features for column a (subtract the mean, divide by the standard deviation)
Example No. 59
0
                for e in dy.Exon.unique():
                    ident.append(exoncode[e])
                    exog_vc.append((dy.Exon == e).astype(int))

                for p in dy.Person.unique():
                    ident.append(4)
                    exog_vc.append((dy.Person == p).astype(int))

                for s in dy.Sample.unique():
                    ident.append(5)
                    exog_vc.append((dy.Sample == s).astype(int))

                exog_vc = np.vstack(exog_vc).T
                ident = np.asarray(ident)

                endog, exog = patsy.dmatrices(
                    fml, data=dy, return_type='dataframe')
                vcp_names = [
                    "Gene(Mat)", "Gene(Pat)", "Exon(Mat)", "Exon(Pat)",
                    "Person", "Sample"
                ]
                model = BinomialBayesMixedGLM(
                    endog,
                    exog,
                    exog_vc,
                    ident,
                    vcp_p=3,
                    fe_p=3,
                    vcp_names=vcp_names)

            if kc != 3:
                model2 = BinomialBayesMixedGLM.from_formula(
Example No. 60
0
def get_design_matrices(df, dependent_variable, independent_variables, interactions=[]):
    patsy_model = create_patsy_model(dependent_variable, independent_variables, interactions=interactions)
    y, X = dmatrices(patsy_model, df, return_type='dataframe')
    return (y, X)
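For comparison, the same (y, X) pair can come straight from a formula string without the helper; the data frame and column names here are made up:

import pandas as pd
from patsy import dmatrices

df = pd.DataFrame({"y": [1.0, 2.0, 3.0], "x1": [0.5, 0.1, 0.9], "g": ["a", "b", "a"]})

# dummy-code g, keep x1 numeric; patsy adds the intercept automatically
y, X = dmatrices("y ~ x1 + C(g)", df, return_type="dataframe")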