Example #1
    def run_xval_stn(self, stn_id, bw_nngh=100):
        '''
        Run a single leave-one-out cross validation of a geographically
        weighted regression model of a station's monthly and annual normals
        (norm~lst+elev+lon+lat).
        
        Parameters
        ----------
        stn_id : str
            The stn_id for which to run the cross validation
        bw_nngh : int, optional
            The number of neighbors to use for the
            geographically weighted regression. Default: 100.
        
        Returns
        -------
        errs : ndarray
            A 13-element array of differences between predicted and
            observed (predicted minus observed): one entry for each of
            the 12 monthly normals, followed by the annual normal.
        '''

        xval_stn = self.stn_da.stns[self.stn_da.stn_idxs[stn_id]]
        df_xval_stn = self.df_stns.loc[stn_id, :]
        self.stn_slct.set_ngh_stns(xval_stn[LAT],
                                   xval_stn[LON],
                                   bw_nngh,
                                   load_obs=False,
                                   stns_rm=stn_id)
        df_nghs = self.df_stns.loc[self.stn_slct.ngh_stns[STN_ID], :]

        errs = np.empty(13)

        # Errors for monthly normals
        for mth in np.arange(1, 13):

            ls_form = 'norm%.2d~lst%.2d+elevation+longitude+latitude' % (mth,
                                                                         mth)
            ls_fit = sm.wls(ls_form,
                            data=df_nghs,
                            weights=self.stn_slct.ngh_wgt).fit()
            err = ls_fit.predict(df_xval_stn)[0] - df_xval_stn['norm%.2d' %
                                                               mth]
            errs[mth - 1] = err

        # Error for annual normal
        ls_form = 'norm~lst+elevation+longitude+latitude'
        ls_fit = sm.wls(ls_form, data=df_nghs,
                        weights=self.stn_slct.ngh_wgt).fit()
        err = ls_fit.predict(df_xval_stn)[0] - df_xval_stn['norm']
        errs[-1] = err

        return errs
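For context, a minimal self-contained sketch of the weighted fit each loop iteration above performs, on synthetic data (column names follow the formula; ngh_wgt is a stand-in for the distance-based neighbor weights used above):

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
n = 100
df_nghs = pd.DataFrame({
    'norm01': rng.normal(size=n),
    'lst01': rng.normal(size=n),
    'elevation': rng.uniform(0, 3000, size=n),
    'longitude': rng.uniform(-120, -100, size=n),
    'latitude': rng.uniform(30, 50, size=n),
})
ngh_wgt = rng.uniform(0.1, 1.0, size=n)  # stand-in for neighbor weights
ls_fit = smf.wls('norm01 ~ lst01 + elevation + longitude + latitude',
                 data=df_nghs, weights=ngh_wgt).fit()
print(ls_fit.params)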
Example #2
def _fit_hdd_only(df, weighted=False):

    bps = [i[4:] for i in df.columns if i[:3] == 'HDD']
    best_bp, best_rsquared, best_mod, best_res = None, -9e9, None, None
    best_formula, hdd_qualified = None, False

    try:  # TODO: fix big try block anti-pattern
        for bp in bps:
            candidate_hdd_formula = 'upd ~ HDD_' + bp
            if (np.nansum(df['HDD_' + bp] > 0) < 10) or \
               (np.nansum(df['HDD_' + bp]) < 20):
                continue
            if weighted:
                candidate_hdd_mod = smf.wls(formula=candidate_hdd_formula,
                                            data=df,
                                            weights=df['ndays'])
            else:
                candidate_hdd_mod = smf.ols(formula=candidate_hdd_formula,
                                            data=df)
            candidate_hdd_res = candidate_hdd_mod.fit()
            candidate_hdd_rsquared = candidate_hdd_res.rsquared_adj
            if (candidate_hdd_rsquared > best_rsquared
                    and candidate_hdd_res.params['Intercept'] >= 0
                    and candidate_hdd_res.params['HDD_' + bp] >= 0
                    and candidate_hdd_res.pvalues['HDD_' + bp] < 0.1):
                best_bp, best_rsquared = int(bp), candidate_hdd_rsquared
                best_mod, best_res = candidate_hdd_mod, candidate_hdd_res
                hdd_qualified = True
                best_formula = 'upd ~ HDD_' + bp
    except:  # TODO: catch specific error
        best_rsquared, hdd_qualified = 0, False
        best_formula, best_mod, best_res = None, None, None
        best_bp = None

    return best_formula, best_mod, best_res, best_rsquared, hdd_qualified, best_bp
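A hypothetical call to _fit_hdd_only on synthetic daily data (the HDD_<bp> columns hold heating degree days at candidate balance points; imports shown for self-containment, matching those the snippet implies):

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
n = 365
df = pd.DataFrame({
    'HDD_55': rng.uniform(0, 20, size=n),
    'HDD_60': rng.uniform(0, 25, size=n),
    'ndays': np.ones(n),
})
df['upd'] = 5 + 0.8 * df['HDD_60'] + rng.normal(scale=1.0, size=n)
formula, mod, res, rsq, qualified, bp = _fit_hdd_only(df, weighted=True)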
Example #3
def determineTrendWLS(dictParam):
    NLag = dictParam['NLag']
    import statsmodels.formula.api as sm
    df = dictParam['df']

    dfOLS = pd.DataFrame(df['Close'])
    dfOLS['i'] = range(0, dfOLS.index.size)
    dfOLS['weight'] = dfOLS['i']
    dfOLS['weight'] = dfOLS['weight'] - dfOLS['i'].mean()
    dfOLS['weight'] = dfOLS['weight'].apply(lambda x: np.power(abs(x), 2))
    dfOLS['weight'] = dfOLS['weight'] / dfOLS['weight'].sum()
    dfOLS = dfOLS.dropna()

    #wls = sm.wls(formula='Close ~ i', data=dfOLS, weights=dfOLS.weight.values).fit()
    wls = sm.wls(formula='Close ~ i', data=dfOLS,
                 weights=dfOLS.weight).fit(cov_type='HAC',
                                           cov_kwds={'maxlags': NLag})
    #wls = sm.wls(formula='Close ~ i', data=dfOLS, weights=dfOLS.weight).fit(cov_type='HC0')
    t = wls.tvalues['i']

    tThreshold = 2
    if t > tThreshold:
        return 1
    elif t < -tThreshold:
        return -1
    else:
        return 0
Example #4
def model_at(formula, **kwargs):
    data = data_at(**kwargs)
    data.dropna(inplace=True)
    print(data)
    print(formula)
    model = smf.wls(formula, weights=data.weight, data=data)
    return model, data
Example #5
def _estimate_hour_of_week_occupancy(model_data, threshold):
    index = pd.CategoricalIndex(range(168))
    if model_data.dropna().empty:
        return pd.Series(np.nan, index=index, name="occupancy")

    usage_model = smf.wls(
        formula="meter_value ~ cdd_65 + hdd_50",
        data=model_data,
        weights=model_data.weight,
    )

    model_data_with_residuals = model_data.merge(
        pd.DataFrame({"residuals": usage_model.fit().resid}),
        left_index=True,
        right_index=True,
    )

    def _is_high_usage(df):
        if df.empty:
            return np.nan
        n_positive_residuals = sum(df.residuals > 0)
        n_residuals = float(len(df.residuals))
        ratio_positive_residuals = n_positive_residuals / n_residuals
        return int(ratio_positive_residuals > threshold)

    return (model_data_with_residuals.groupby([
        "hour_of_week"
    ]).apply(_is_high_usage).rename("occupancy").reindex(index).astype(bool)
            )  # guarantee an index value for all hours
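A minimal sketch of calling this function with synthetic data (column names taken from the formula above; the 0.65 threshold is an arbitrary illustration, and numpy/pandas/statsmodels imports are assumed as in the snippet):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 168 * 4  # four weeks of hourly observations
model_data = pd.DataFrame({
    'meter_value': rng.normal(10, 2, size=n),
    'cdd_65': rng.uniform(0, 5, size=n),
    'hdd_50': rng.uniform(0, 5, size=n),
    'weight': np.ones(n),
    'hour_of_week': np.tile(np.arange(168), 4),
})
occupancy = _estimate_hour_of_week_occupancy(model_data, threshold=0.65)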
Example #6
def wls_cluster(formula, df, wt, clt):
    """
    wt      : Weight
    clt     : Cluster
    """
    model = wls(formula=formula, data=df, weights=df[wt])
    reg = model.fit(cov_type='cluster', cov_kwds={'groups': df[clt]})

    return reg
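A hypothetical usage with synthetic data (assumes `from statsmodels.formula.api import wls` as the snippet implies; column names are made up):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'y': rng.normal(size=200),
    'x': rng.normal(size=200),
    'n_obs': rng.integers(1, 10, size=200).astype(float),  # used as weights
    'firm': rng.integers(0, 8, size=200),                  # cluster identifier
})
reg = wls_cluster('y ~ x', df, wt='n_obs', clt='firm')
print(reg.summary())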
Example #7
def WLS(xdata, ydata, xerr):
    ws = pandas.DataFrame({'x': xdata, 'y': ydata})
    weights = pandas.Series(xerr)
    fit = sm.wls('y ~ x', data=ws, weights=1 / weights).fit()
    Int, x = fit.pvalues
    residuals = fit.resid
    rval = fit.rsquared
    residuals = [abs(i) for i in residuals]
    newerr = numpy.sqrt(sum(residuals) / (len(residuals) - 2))
    return round(rval, 2), fit.predict(), round(newerr, 2)
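A hypothetical call (assumes `import pandas`, `import numpy`, and `import statsmodels.formula.api as sm` as the function implies; xerr plays the role of per-point uncertainties):

rsq, fitted, err = WLS(xdata=[1, 2, 3, 4, 5],
                       ydata=[2.0, 4.1, 5.9, 8.2, 9.8],
                       xerr=[0.1, 0.1, 0.2, 0.1, 0.1])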
Example #8
def get_calibration(data):
    data = data.copy()
    try:
        data['weight'] = data.known_concentration**-2
    except ZeroDivisionError:
        data['weight'] = np.nan
    data = data.replace([np.inf, -np.inf],
                        np.nan).dropna(subset=['weight', 'area'])
    if not len(data) > 1:
        return

    # Deal with presence/absence of an intercept term according to calibration_config
    _intercept = data.intercept.unique()
    assert len(_intercept) == 1
    intercept = _intercept[0]

    try:
        if intercept == 0:
            fit = sm.wls('area ~ known_concentration - 1',
                         data=data,
                         weights=data.weight).fit()
        else:
            fit = sm.wls('area ~ known_concentration',
                         data=data,
                         weights=data.weight).fit()
    except ValueError as err:
        print(data, file=sys.stderr)
        raise err

    out = {}
    if hasattr(fit.params, 'Intercept'):
        out['intercept'] = fit.params.Intercept
        out['slope'] = fit.params[1]
    else:
        out['intercept'] = 0
        out['slope'] = fit.params[0]
    out['limit_of_detection'] = np.nan  # TODO
    out['observations'] = fit.nobs
    out['relative_standard_error'] = relative_standard_error(fit)
    out['rsquared'] = fit.rsquared

    return pd.Series(out)
Example #9
 def __get_model_fit(
     self, serie: Optional[int] = None
 ) -> sm.RegressionResultsWrapper:
     if serie is None:
         calibration_data: pd.DataFrame = self.data.calibration_data
     else:
         calibration_data: pd.DataFrame = self.data.get_serie(serie, "calibration")
     return smf.wls(
         formula=self.formula,
         weights=dmatrix(self.weight, calibration_data),
         data=calibration_data,
     ).fit()
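The weight attribute here is a patsy formula string evaluated against the calibration data via dmatrix. A standalone sketch of that pattern (the weight spec "I(1 / conc ** 2) - 1" and the column names are hypothetical):

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from patsy import dmatrix

calibration_data = pd.DataFrame({'signal': [1.0, 2.1, 2.9, 4.2],
                                 'conc': [1.0, 2.0, 3.0, 4.0]})
# Evaluate the weight formula to a 1-D array of weights
weights = np.asarray(dmatrix('I(1 / conc ** 2) - 1', calibration_data)).ravel()
fit = smf.wls(formula='signal ~ conc', weights=weights,
              data=calibration_data).fit()
print(fit.params)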
Example #10
def statsmodels_results(xdata, ydata, xerr=None):
    ws=pandas.DataFrame({'x':xdata, 'y':ydata})
    if xerr is not None:
        weights=pandas.Series(xerr)
        fit=sm.wls('y ~ x', data=ws, weights=1/weights).fit()
    else:
        fit=sm.ols('y ~ x', data=ws).fit()
    Int, x=fit.pvalues
    residuals=fit.resid
    rval=fit.rsquared
    residuals=[abs(i) for i in residuals]
    newerr=numpy.sqrt(sum(residuals)/(len(residuals)-2))
    return fit, round(rval, 2), round(newerr,2)
Example #11
def fit_caltrack_hourly_model_segment(segment_name, segment_data):
    def _get_hourly_model_formula(data):
        if (np.sum(data.loc[data.weight > 0].occupancy) == 0) or (np.sum(
                data.loc[data.weight > 0].occupancy) == len(
                    data.loc[data.weight > 0].occupancy)):
            bin_occupancy_interactions = "".join(
                [" + {}".format(c) for c in data.columns if "bin" in c])
            return "meter_value ~ C(hour_of_week) - 1{}".format(
                bin_occupancy_interactions)
        else:
            bin_occupancy_interactions = "".join([
                " + {}:C(occupancy)".format(c) for c in data.columns
                if "bin" in c
            ])
            return "meter_value ~ C(hour_of_week) - 1{}".format(
                bin_occupancy_interactions)

    warnings = []
    if segment_data.dropna().empty:
        model = None
        formula = None
        model_params = None
        warnings.append(
            EEMeterWarning(
                qualified_name=
                "eemeter.fit_caltrack_hourly_model_segment.no_nonnull_data",
                description=
                "The segment contains either an empty dataset or all NaNs.",
                data={
                    "n_rows": segment_data.shape[0],
                    "n_rows_after_dropna": segment_data.dropna().shape[0],
                },
            ))
    else:

        formula = _get_hourly_model_formula(segment_data)
        model = smf.wls(formula=formula,
                        data=segment_data,
                        weights=segment_data.weight)
        model_params = {
            coeff: value
            for coeff, value in model.fit().params.items()
        }

    return CalTRACKSegmentModel(
        segment_name=segment_name,
        model=model,
        formula=formula,
        model_params=model_params,
        warnings=warnings,
    )
Example #12
def rdestimate(data, y, x, controls=None, cutpoint=0, weights=1):
    """ Wrapper around `smf.wls` to produce `RDestimate`"""
    data["TREATED"] = np.where(data[x] >= cutpoint, 1, 0)
    equation = f"{y} ~ TREATED + {x}"
    if controls is not None:
        if isinstance(controls, list):
            eq_controls = " + ".join(controls)
        elif isinstance(controls, str):
            eq_controls = controls
        else:
            print(type(controls), "controls should be either list or str")
            eq_controls = ""
        equation += " + " + eq_controls
    return smf.wls(equation, data=data, weights=weights)
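A hypothetical use on a synthetic sharp-RDD design (assumes numpy/pandas/statsmodels imports as in the snippet; the true jump at the cutoff is 2.0):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
run = rng.uniform(-1, 1, size=500)
df = pd.DataFrame({
    'run': run,
    'y': 2.0 * (run >= 0) + run + rng.normal(scale=0.5, size=500),
})
model = rdestimate(df, y='y', x='run', cutpoint=0)
print(model.fit().params['TREATED'])  # should be close to 2.0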
Example #13
File: rdd.py Project: divyansha/rdd
def rdd(input_data,
        xname,
        yname=None,
        cut=0,
        equation=None,
        controls=None,
        noconst=False,
        weights=1,
        verbose=True):
    '''
    This function implements a linear regression (ordinary or weighted least squares) that
        regresses the outcome variable on the running variable.  A "TREATED" variable
        is created, the coefficient on which is the causal effect of being to the right of the threshold.

        The user may specify a list of controls to be added linearly, or supply their own equation.  

    INPUT:
        input_data: dataset with outcome and running variables (and potentially controls) (pandas DataFrame)
        xname: name of running variable (string)
        yname: name of outcome variable (string) (default is None - not needed if you include your own equation)
        cut: location of threshold in xname (scalar) (default is 0)
        equation: Estimation equation as a string (see Statsmodels formula syntax for more info)
        controls: List of controls to include in the estimation (list of strings) (not needed if you include your own equation)
        noconst: If True, model does not estimate an intercept (bool) (default is False)
        weights: Weights for weighted least squares (numpy array) (default is equal weights, ie OLS)

    OUTPUT:
        Statsmodels object

    '''
    if yname is None and equation is None:
        raise NameError(
            "You must supply either an outcome variable name or an equation to estimate."
        )
    if 'TREATED' in input_data.columns:
        raise NameError(
            "TREATED is a reserved column name.  Please change the name.")
    data = input_data.copy()  # To avoid SettingWithCopy warnings
    data['TREATED'] = np.where(data[xname] >= cut, 1, 0)
    if equation is None:
        equation = yname + ' ~ TREATED + ' + xname
        if controls is not None:
            equation_controls = ' + '.join(controls)
            equation += ' + ' + equation_controls
    if noconst:
        equation += ' -1'
    if verbose:
        print('Estimation Equation:\t', equation)
    rdd_model = smf.wls(equation, data=data, weights=weights)
    return rdd_model
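Continuing in the same vein, a hypothetical call to rdd on fresh synthetic data (rdd copies its input and raises if a TREATED column already exists):

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
run = rng.uniform(-1, 1, size=500)
data = pd.DataFrame({
    'run': run,
    'y': 1.5 * (run >= 0) + 0.5 * run + rng.normal(scale=0.3, size=500),
})
result = rdd(data, xname='run', yname='y', cut=0).fit()
print(result.params['TREATED'])  # estimated effect at the cutoff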
Example #14
def _fit_full(df, weighted=False, billing=False):

    hdd_bps = [i[4:] for i in df.columns if i[:3] == 'HDD']
    cdd_bps = [i[4:] for i in df.columns if i[:3] == 'CDD']

    best_hdd_bp, best_cdd_bp, best_rsquared, best_mod, best_res = \
        None, None, -9e9, None, None
    best_formula, full_qualified = None, False

    try:  # TODO: fix big try block anti-pattern
        for hdd_bp in hdd_bps:
            for cdd_bp in cdd_bps:
                if cdd_bp < hdd_bp:
                    continue
                candidate_full_formula = 'upd ~ CDD_' + cdd_bp + \
                                         ' + HDD_' + hdd_bp
                if not billing:
                    if (np.nansum(df['HDD_' + hdd_bp] > 0) < 10) or \
                       (np.nansum(df['HDD_' + hdd_bp]) < 20):
                        continue
                    if (np.nansum(df['CDD_' + cdd_bp] > 0) < 10) or \
                       (np.nansum(df['CDD_' + cdd_bp]) < 20):
                        continue
                if weighted:
                    candidate_full_mod = smf.wls(
                        formula=candidate_full_formula,
                        data=df,
                        weights=df['ndays'])
                else:
                    candidate_full_mod = smf.ols(
                        formula=candidate_full_formula, data=df)
                candidate_full_res = candidate_full_mod.fit()
                candidate_full_rsquared = candidate_full_res.rsquared_adj
                if (candidate_full_rsquared > best_rsquared
                        and candidate_full_res.params['Intercept'] >= 0
                        and candidate_full_res.params['HDD_' + hdd_bp] >= 0
                        and candidate_full_res.params['CDD_' + cdd_bp] >= 0
                        and candidate_full_res.pvalues['HDD_' + hdd_bp] < 0.1
                        and candidate_full_res.pvalues['CDD_' + cdd_bp] < 0.1):
                    best_hdd_bp, best_cdd_bp, best_rsquared = \
                        int(hdd_bp), int(cdd_bp), candidate_full_rsquared
                    best_mod, best_res = candidate_full_mod, candidate_full_res
                    full_qualified = True
                    best_formula = 'upd ~ CDD_' + cdd_bp + ' + HDD_' + hdd_bp
    except:  # TODO: catch specific error
        best_rsquared, full_qualified = 0, False
        best_formula, best_mod, best_res = None, None, None
        best_hdd_bp, best_cdd_bp = None, None

    return best_formula, best_mod, best_res, best_rsquared, full_qualified, best_hdd_bp, best_cdd_bp
Example #15
def rolling_ols(formula: str,
                data: pd.DataFrame,
                window: int,
                r2_adj=False,
                expanding=False,
                robust=False,
                M=sm.robust.norms.AndrewWave()):

    para_res = {}
    r_2_res = {}
    model_sig = {}
    forcast_res = pd.Series([], dtype=float)

    for i in range(len(data) - window + 1):

        if expanding:
            start_index = 0
        else:
            start_index = i

        tmp_df = data.iloc[start_index:i + window]
        forcast_x = data.iloc[i + window:i + window + 1]

        if robust:
            rlm_model = smf.rlm(formula, data=tmp_df, M=M)
            ols_result = smf.wls(formula,
                                 data=tmp_df,
                                 weights=rlm_model.fit().weights).fit()
            # ols_result = sm.WLS(rlm_model.endog, rlm_model.exog,
            #                     weights=rlm_model.fit().weights).fit()
        else:
            ols_result = smf.ols(formula, data=tmp_df).fit()

        para_res[data.index[i + window - 1]] = ols_result.params
        model_sig[data.index[i + window - 1]] = ols_result.f_pvalue

        if r2_adj:
            r_2_res[data.index[i + window - 1]] = ols_result.rsquared_adj
        else:
            r_2_res[data.index[i + window - 1]] = ols_result.rsquared

        # one-step-ahead forecast (Series.append was removed in pandas 2.x)
        forcast_res = pd.concat([forcast_res, ols_result.predict(forcast_x)])

    para_res = pd.DataFrame(para_res).T
    r_2_res = pd.Series(r_2_res)
    model_sig = pd.Series(model_sig)

    return para_res, r_2_res.mean(), model_sig, forcast_res
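A hypothetical call on synthetic time-series data (assumes the pandas/numpy/statsmodels imports the function implies):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
idx = pd.date_range('2020-01-01', periods=120, freq='D')
data = pd.DataFrame({'x': rng.normal(size=120)}, index=idx)
data['y'] = 0.5 * data['x'] + rng.normal(scale=0.1, size=120)
params, mean_r2, sig, forecasts = rolling_ols('y ~ x', data, window=60)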
Example #16
def calibrate():
    (times, gaussian_means, gaussian_stds) = find_gaussians()

    weights = 1 / np.power(gaussian_stds, 2)

    # put x and y into a pandas DataFrame, and the weights into a Series
    ws = pd.DataFrame({
        'x': times,
        'y': gaussian_means,
        'yerr': [x * 1000 for x in gaussian_stds]
    })

    wls_fit = sm.wls('x ~ y', data=ws, weights=1 / weights).fit()

    return ((wls_fit.params['y'], wls_fit.params['Intercept']),
            (wls_fit.bse['y'], wls_fit.bse['Intercept']))
Example #17
def init_model(df_train,
               df_test,
               model_txt,
               yield_type='rainfed',
               weight=False):
    if weight:
        results = smf.wls(model_txt,
                          data=df_train,
                          missing='drop',
                          weights=df_train['corn_percent']).fit()
    else:
        results = smf.ols(model_txt, data=df_train, missing='drop').fit()
    return results, df_test.copy().join(
        results.predict(df_test).to_frame('Predicted_' +
                                          yield_type_dict[yield_type] +
                                          '_ana'))
Example #18
def _fit_intercept(df, weighted=False):
    int_formula = 'upd ~ 1'
    try:
        if weighted:
            int_mod = smf.wls(formula=int_formula,
                              data=df,
                              weights=df['ndays'])
        else:
            int_mod = smf.ols(formula=int_formula, data=df)
        int_res = int_mod.fit()
    except:  # TODO: catch specific error
        int_rsquared, int_qualified = 0, False
        int_formula, int_mod, int_res = None, None, None
    else:
        int_rsquared, int_qualified = 0, True

    return int_formula, int_mod, int_res, int_rsquared, int_qualified
Example #19
def fit_caltrack_hourly_model_segment(segment_name, segment_data):
    def _get_hourly_model_formula(data):
        bin_occupancy_interactions = "".join(
            [" + {}:C(occupancy)".format(c) for c in data.columns if "bin" in c]
        )
        return "meter_value ~ C(hour_of_week) - 1{}".format(bin_occupancy_interactions)

    formula = _get_hourly_model_formula(segment_data)
    model = smf.wls(formula=formula, data=segment_data, weights=segment_data.weight)
    model_params = {coeff: value for coeff, value in model.fit().params.items()}
    warnings = []
    return SegmentModel(
        segment_name=segment_name,
        model=model,
        formula=formula,
        model_params=model_params,
        warnings=warnings,
    )
Example #20
    def setup_class(cls):
        import statsmodels.formula.api as smf

        data = sm.datasets.cpunish.load_pandas()
        endog = data.endog
        data = data.exog
        data['EXECUTIONS'] = endog
        data['INCOME'] /= 1000
        aweights = np.array(
            [1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2, 1])
        model = smf.glm(
            'EXECUTIONS ~ INCOME + SOUTH - 1',
            data=data,
            family=sm.families.Gaussian(link=sm.families.links.identity()),
            var_weights=aweights)
        wlsmodel = smf.wls('EXECUTIONS ~ INCOME + SOUTH - 1',
                           data=data,
                           weights=aweights)
        cls.res1 = model.fit(rtol=1e-25, atol=1e-25)
        cls.res2 = wlsmodel.fit()
Example #21
def armonic(t, m, f, merr):
    ws = pd.DataFrame({
        'x': m,
        'y1': np.sin(2 * np.pi * t * f),
        'y2': np.cos(2 * np.pi * t * f),
        'y3': np.sin(4 * np.pi * t * f),
        'y4': np.cos(4 * np.pi * t * f),
        'y5': np.sin(6 * np.pi * t * f),
        'y6': np.cos(6 * np.pi * t * f),
        'y7': np.sin(8 * np.pi * t * f),
        'y8': np.cos(8 * np.pi * t * f)
    })
    weights = pd.Series(merr)
    wls_fit = sm.wls('x ~ y1+y2+y3+y4+y5+y6+y7+y8-1',
                     data=ws,
                     weights=1 / weights).fit()
    pred = wls_fit.predict()
    r = m - pred
    A = np.zeros(4)
    PH = np.zeros(4)
    A[0] = np.sqrt(wls_fit.params[0]**2 + wls_fit.params[1]**2)
    A[1] = np.sqrt(wls_fit.params[2]**2 + wls_fit.params[3]**2)
    A[2] = np.sqrt(wls_fit.params[4]**2 + wls_fit.params[5]**2)
    A[3] = np.sqrt(wls_fit.params[6]**2 + wls_fit.params[7]**2)
    PH[0] = np.arctan2(wls_fit.params[1], wls_fit.params[0]) - (
        1 * f / f) * np.arctan2(wls_fit.params[1], wls_fit.params[0])
    PH[1] = np.arctan2(wls_fit.params[3], wls_fit.params[2]) - (
        2 * f / f) * np.arctan2(wls_fit.params[1], wls_fit.params[0])
    PH[2] = np.arctan2(wls_fit.params[5], wls_fit.params[4]) - (
        3 * f / f) * np.arctan2(wls_fit.params[1], wls_fit.params[0])
    PH[3] = np.arctan2(wls_fit.params[7], wls_fit.params[6]) - (
        4 * f / f) * np.arctan2(wls_fit.params[1], wls_fit.params[0])
    influence = inf.OLSInfluence(wls_fit)
    dffits = influence.dffits
    cook = influence.cooks_distance
    leverage = influence.hat_matrix_diag
    inf1 = np.where(dffits[0] > dffits[1])
    inf2 = np.where(cook[1] < 0.05)
    inffin = np.concatenate((inf1, inf2), axis=1)
    return pred, r, A, PH, inffin
Example #22
    def setup_class(cls):
        import statsmodels.formula.api as smf

        data = sm.datasets.cpunish.load_pandas()
        endog = data.endog
        data = data.exog
        data['EXECUTIONS'] = endog
        data['INCOME'] /= 1000
        aweights = np.array([1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2,
                             1])
        model = smf.glm(
                'EXECUTIONS ~ INCOME + SOUTH - 1',
                data=data,
                family=sm.families.Gaussian(link=sm.families.links.identity()),
                var_weights=aweights
        )
        wlsmodel = smf.wls(
                'EXECUTIONS ~ INCOME + SOUTH - 1',
                data=data,
                weights=aweights)
        cls.res1 = model.fit(rtol=1e-25, atol=1e-25)
        cls.res2 = wlsmodel.fit()
Example #23
def lm_formula(data, xseq, **params):
    """
    Fit OLS / WLS using a formula
    """
    formula = params['formula']
    eval_env = params['enviroment']
    weights = data.get('weight', None)

    if weights is None:
        init_kwargs, fit_kwargs = separate_method_kwargs(
            params['method_args'], sm.OLS, sm.OLS.fit)
        model = smf.ols(formula, data, eval_env=eval_env, **init_kwargs)
    else:
        if np.any(weights < 0):
            raise ValueError("All weights must be greater than zero.")
        init_kwargs, fit_kwargs = separate_method_kwargs(
            params['method_args'], sm.OLS, sm.OLS.fit)
        model = smf.wls(formula,
                        data,
                        weights=weights,
                        eval_env=eval_env,
                        **init_kwargs)

    results = model.fit(**fit_kwargs)
    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(data)

    if params['se']:
        _, predictors = dmatrices(formula, data, eval_env=eval_env)
        alpha = 1 - params['level']
        prstd, iv_l, iv_u = wls_prediction_std(results,
                                               predictors,
                                               alpha=alpha)
        data['se'] = prstd
        data['ymin'] = iv_l
        data['ymax'] = iv_u
    return data
Example #24
def forward_select_weighted(df, resp_str, maxk, counts):

    remaining = set(df.columns)
    remaining.remove(resp_str)
    selected = []
    numselected = 1
    score_crnt, score_new = 0.0, 0.0
    while remaining and score_crnt == score_new:
        score_array = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(resp_str,
                                           ' + '.join(selected + [candidate]))
            score = smf.wls(formula, df, weights=counts).fit().rsquared_adj
            score_array.append((score, candidate))
        score_array.sort()
        score_new, best_option = score_array.pop()
        if score_crnt < score_new and numselected <= maxk:
            remaining.remove(best_option)
            selected.append(best_option)
            score_crnt = score_new
            numselected += 1
    formula = "{} ~ {} + 1".format(resp_str, ' + '.join(selected))
    model = smf.ols(formula, df).fit()
    return model
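A hypothetical run on synthetic data, using per-row observation counts as weights (assumes numpy/pandas and `statsmodels.formula.api as smf` imports as the snippet implies):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 4)), columns=['y', 'a', 'b', 'c'])
df['y'] = 2 * df['a'] - df['b'] + rng.normal(scale=0.1, size=200)
counts = rng.integers(1, 20, size=200).astype(float)
model = forward_select_weighted(df, resp_str='y', maxk=2, counts=counts)
print(model.params)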
Example #25
def getBestColumns(df, columns, patsy_string_so_far, for_method, includePripas1):
  best_columns = []
  for x in columns:
    if df[x].nunique() > 20 and for_method == 'ANOVA':
      continue
    # Remove future-looking columns
    if x in (['subsid', 'weight', 'priexp1'] + ([] if includePripas1 else ['pripas1'])):
      continue
    if 'priexp' in x or 'genecon' in x or x == 'downchance':
      continue
    if 'exp' in x or 'brexit_' in x:
      continue
    if for_method in ['DT', 'SVM'] and 'age_grp' == x:
      continue
    formula = 'df.priexp1 ~ ' + patsy_string_so_far + 'C(' + x + ')'
    try:
      lm = wls(formula, df, weights = df.weight).fit()
      if lm.nobs > AT_LEAST_THIS_MANY_OBS:
        best_columns.append([lm.rsquared_adj,x, lm.params])
    except:
      pass #don't handle

  best_columns.sort(reverse = True)
  return best_columns[:(5 if for_method == 'SVM' else 20)]
Example #26
# <codecell>

m_regression_data[["PVI", "per_black", "per_hisp", "older_pop", "average_income", 
                   "romney_give", "obama_give", "educ_coll", "educ_hs"]].corr()

# <codecell>

(today - m_regression_data["poll_date"].astype('O'))

# <codecell>

time_weights = (today - m_regression_data["poll_date"].astype('O')).apply(exp_decay)

# <codecell>

m_model = wls("m ~ PVI + per_hisp + per_black + average_income + educ_coll", data=m_regression_data, weights=time_weights).fit()
m_model.summary()

# <codecell>

state_resid = pandas.DataFrame(zip(m_model.resid, m_regression_data.State), 
                               columns=["resid", "State"])

# <codecell>

state_resid_group = state_resid.groupby("State")

# <codecell>

fig, axes = plt.subplots(figsize=(12,8), subplot_kw={"ylabel" : "Residual",
                                                     "xlabel" : "State"})
Example #27
castle['lead5'] = castle['time_til'] == -5
castle['lead6'] = castle['time_til'] == -6
castle['lead7'] = castle['time_til'] == -7
castle['lead8'] = castle['time_til'] == -8
castle['lead9'] = castle['time_til'] == -9
castle['lag0'] = castle['time_til'] == 0
castle['lag1'] = castle['time_til'] == 1
castle['lag2'] = castle['time_til'] == 2
castle['lag3'] = castle['time_til'] == 3
castle['lag4'] = castle['time_til'] == 4
castle['lag5'] = castle['time_til'] == 5

formula = "l_homicide ~ r20001 + r20002 + r20003 + r20011 + r20012 + r20013 + r20021 + r20022 + r20023 + r20031 + r20032 + r20033 + r20041 + r20042 + r20043 + r20051 + r20052 + r20053 + r20061 + r20062 + r20063 + r20071 + r20072 + r20073 + r20081 + r20082 + r20083 + r20091 + r20092 + r20093 + lead1 + lead2 + lead3 + lead4 + lead5 + lead6 + lead7 + lead8 + lead9 + lag1 + lag2 + lag3 + lag4 + lag5 + C(year) + C(state)"

event_study_formula = smf.wls(formula, data=castle,
                              weights=castle['popwt']).fit(
                                  cov_type='cluster',
                                  cov_kwds={'groups': castle['sid']})

leads = [
    'lead9[T.True]', 'lead8[T.True]', 'lead7[T.True]', 'lead6[T.True]',
    'lead5[T.True]', 'lead4[T.True]', 'lead3[T.True]', 'lead2[T.True]',
    'lead1[T.True]'
]
lags = [
    'lag1[T.True]', 'lag2[T.True]', 'lag3[T.True]', 'lag4[T.True]',
    'lag5[T.True]'
]

leadslags_plot = pd.DataFrame({
    'sd':
    np.concatenate([
Example #28
if False:
    formula_rhs = formula_rhs + " + " + " + ".join(gb_cols)
    formula_rhs = formula_rhs + " + " + " + ".join(elorange_cols)

# hey lets just use the elorange columns and see how they do
#formula_rhs = " + ".join(elorange_cols)

formula = "elo ~ " + " + ".join(rhs_cols)

msg("Fitting!")

weights = np.ones(train.shape[0])

do_statsmodels=True
if do_statsmodels:
    ols = sm.wls(formula=formula, data=train, weights=weights).fit()
    print(ols.summary())
    msg("Making predictions for all playergames")
    yy_df['ols_prediction'] = ols.predict(yy_df)
else:
    ols_lr = LassoCV(n_jobs=-1, verbose=True)
    X = train[rhs_cols]
    y = train['elo']
    ols_lr.fit(X,y)
    yy_df['ols_prediction'] = ols_lr.predict(X)

yy_df['ols_error'] = (yy_df['ols_prediction'] - yy_df['elo']).abs()
yy_df['training'] = (yy_df['gamenum'] % 3)
insample_scores = yy_df.groupby('training')['ols_error'].agg(['mean', 'median', 'std'])
print(insample_scores)
Example #29
import os
import pandas as pd
import statsmodels.api as sm
from matplotlib import pyplot as plt
from scipy.stats import levene
from statsmodels.stats.anova import anova_lm
import seaborn as sns
import statsmodels.formula.api as smf
from variables import DIR_OUT

if __name__ == "__main__":
    path_df = os.path.join(DIR_OUT, "derived_tables",
                           "nb_streamlines_hemi_level.csv")
    df = pd.read_csv(path_df)
    # sns.lmplot(x='Mesh_Area',y='Nb_Streamlines_Hemi', hue='Hemisphere',data=df, truncate=True,robust=True)

    #
    model = smf.wls("Nb_Streamlines_Hemi ~ Mesh_Area -1", data=df).fit()
    print(model.summary())
    df["Corrected_Nb_Streamlines_Hemi"] = model.resid
    #  plt.scatter(df['Mesh_Area'].values,df['Nb_Streamlines_Hemi'])
    #  plt.plot(df['Mesh_Area'].values, float(model.params)*(df['Mesh_Area'].values))
    #
    #  plt.show()
    #
    #  # model = smf.ols('Corrected_Nb_Streamlines_Hemi~ PP_CS_Coord_Iso', data=df).fit()
    #  # print model.summary()
    #  # anova = anova_lm(model)
    #  # print summary
    # # print anova
    #  model = smf.ols('Corrected_Nb_Streamlines_Hemi ~ C(Hemisphere)*C(HandednessQ)*C(Gender)*C(AgeQ)',data=df).fit()
    #  print model.summary()
Example #30
plt.ylabel("log(Sales)")
plt.title("Log Transformation of y")
plt.scatter(adv.TV, np.log(adv.Sales), alpha=0.3) 
plt.plot(x_prime, y_hat, 'r', linewidth=2, alpha=0.9)

# View the residuals
plt.figure()
plt.scatter(est.predict(adv), est.resid, alpha=0.3)
plt.title("Residuals with Log Transformation of y")
plt.xlabel("Predicted log(Sales)")
plt.ylabel("Residuals")

#####
# Option #2: Weighted least squares
w = 1. / adv.TV
est_wls = smf.wls(formula='Sales ~ TV', data=adv, weights=w).fit()


# What is the difference?
est = smf.ols(formula='Sales ~ TV', data=adv).fit()
y_hat = est.predict(x_prime)
y_hat_wls = est_wls.predict(x_prime)
plt.xlabel("TV")
plt.ylabel("Sales")
plt.title("OLS (red) vs. WLS (blue")
plt.scatter(adv.TV, adv.Sales, alpha=0.3) 
plt.plot(x_prime, y_hat, 'r', linewidth=2, alpha=0.9)
plt.plot(x_prime, y_hat_wls, 'b', linewidth=2, alpha=0.9)

# What are the pros and cons of these approaches?
Example #31
def fit_caltrack_hourly_model_segment(segment_name, segment_data):
    """ Fit a model for a single segment.

    Parameters
    ----------
    segment_name : :any:`str`
        The name of the segment.
    segment_data : :any:`pandas.DataFrame`
        A design matrix for caltrack hourly, of the form returned by
        :any:`eemeter.caltrack_hourly_prediction_feature_processor`.

    Returns
    -------
    segment_model : :any:`CalTRACKSegmentModel`
        A model that represents the fitted model.
    """
    def _get_hourly_model_formula(data):
        if (np.sum(data.loc[data.weight > 0].occupancy) == 0) or (np.sum(
                data.loc[data.weight > 0].occupancy) == len(
                    data.loc[data.weight > 0].occupancy)):
            bin_occupancy_interactions = "".join(
                [" + {}".format(c) for c in data.columns if "bin" in c])
            return "meter_value ~ C(hour_of_week) - 1{}".format(
                bin_occupancy_interactions)
        else:
            bin_occupancy_interactions = "".join([
                " + {}:C(occupancy)".format(c) for c in data.columns
                if "bin" in c
            ])
            return "meter_value ~ C(hour_of_week) - 1{}".format(
                bin_occupancy_interactions)

    warnings = []
    if segment_data.dropna().empty:
        model = None
        formula = None
        model_params = None
        warnings.append(
            EEMeterWarning(
                qualified_name=
                "eemeter.fit_caltrack_hourly_model_segment.no_nonnull_data",
                description=
                "The segment contains either an empty dataset or all NaNs.",
                data={
                    "n_rows": segment_data.shape[0],
                    "n_rows_after_dropna": segment_data.dropna().shape[0],
                },
            ))
    else:

        formula = _get_hourly_model_formula(segment_data)
        model = smf.wls(formula=formula,
                        data=segment_data,
                        weights=segment_data.weight)
        model_params = {
            coeff: value
            for coeff, value in model.fit().params.items()
        }

    segment_model = CalTRACKSegmentModel(
        segment_name=segment_name,
        model=model,
        formula=formula,
        model_params=model_params,
        warnings=warnings,
    )
    if model:
        this_segment_data = segment_data[segment_data.weight == 1]
        predicted_value = pd.Series(model.fit().predict(this_segment_data))
        segment_model.totals_metrics = ModelMetrics(
            this_segment_data.meter_value, predicted_value, len(model_params))
    else:
        segment_model.totals_metrics = None

    return segment_model
Example #32
t1 = (31 - 2)**0.5 * -0.247984 / (1 + 0.247984**2)**0.5
t.cdf(t1, df=29)

# The residual plot shows clear heteroscedasticity

# plt.scatter(res['地区生产总值'], res['residual'])
# plt.show()
# 1. Weighted least squares
# WLS requires constructing a weight.
# Python cannot automatically find a suitable exponent m,
# so a suitable value can only be chosen by comparing likelihood values,
# usually trying from -2 to 2 in steps of 0.5.
# Here we take the book's result of 2 directly; wls takes the reciprocal internally.
data['w'] = data['地区生产总值'].apply(lambda x: x**-2)

model = smf.wls('财政收入~地区生产总值', data=data, weights=data['w'])
result = model.fit()

result.summary()


res = data
res['residual'] = result.resid * (model.weights**0.5)
# Weighted residual plot
# plt.scatter(res['地区生产总值'], res['residual'])
# plt.show()
# 2. Box-Cox transformation
data = pd.read_csv(r"D:/书籍资料整理/应用回归分析/表4-3.csv")

# Using lmbda=None gives results inconsistent with the book, so lmbda must be specified
x_norm = stats.boxcox(data['财政收入'], lmbda=0)
Example #33
mae = mean_absolute_error(test6_df['salary'], test6_df['predicted_salary'])
print('Mean Absolute Error: {}'.format(mae))

rms = np.sqrt(
    mean_squared_error(test6_df['salary'], test6_df['predicted_salary']))
print('Root Mean Squared Error: {}'.format(rms))

## Model 7
## Model 6 using WLS
test7_df = test_df_nooutlines.copy()
train7_df = train_df_nooutlines.copy()
w = np.ones(len(train7_df))
model7 = str('salary ~ conference + wl_ratio + capacity')

train7_fit = statsform.wls(model7, data=train7_df, weights=1. / (w**2)).fit()
train7_df['predicted_salary'] = train7_fit.fittedvalues
test7_df['predicted_salary'] = train7_fit.predict(test7_df)

test_variance7 = round(
    np.power(test7_df['salary'].corr(test7_df['predicted_salary']), 2), 3)
print('Test Set Variance Accounted for: ', test_variance7)

fit7 = statsform.wls(model7, data=train7_df, weights=1. / (w**2)).fit()
print(fit7.summary())

## Model 8
## Model 6 using GLS
test8_df = test_df_nooutlines.copy()
train8_df = train_df_nooutlines.copy()
Example #34
__author__ = 'Yas'
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import pandas as pd
x_list = [1,2,3,4,5,6,7]
y_list = [1,2,3,1,5,6,7]
y_wts = [0.1,0.1,0.1,0.001,0.1,0.1,0.1]
# put x and y into a pandas DataFrame, and the weights into a Series
ws = pd.DataFrame({
    'x': x_list,
    'y': y_list
})
weights = pd.Series(y_wts)

wls_fit = sm.wls('x ~ y', data=ws, weights=1 / weights).fit()
ols_fit = sm.ols('x ~ y', data=ws).fit()

# show the fit summary by calling wls_fit.summary()
# wls fit r-squared is 0.754
# ols fit r-squared is 0.701

# let's plot our data
plt.clf()
fig = plt.figure()
ax = fig.add_subplot(111, facecolor='w')  # axisbg was renamed to facecolor
ws.plot(
    kind='scatter',
    x='x',
    y='y',
    style='o',
Example #35
formula_rhs = formula_rhs + " + " + " + ".join(material_features)

if False:
    formula_rhs = formula_rhs + " + " + " + ".join(gb_cols)
    formula_rhs = formula_rhs + " + " + " + ".join(elorange_cols)

# hey lets just use the elorange columns and see how they do
#formula_rhs = " + ".join(elorange_cols)


msg("Fitting!")

weights = np.ones(train.shape[0])

formula = "elo_avg ~ " + formula_rhs
ols_avg = sm.wls(formula=formula, data=train, weights=weights).fit()
print(ols_avg.summary())

formula = "elo_advantage ~ " + formula_rhs
ols_ea = sm.wls(formula=formula, data=train, weights=weights).fit()
print(ols_ea.summary())


msg("Making predictions for all playergames")
yy_df['ols_avg_prediction'] = ols_avg.predict(yy_df)
yy_df['ols_ea_prediction'] = ols_ea.predict(yy_df)

yy_df['ols_avg_error'] = (yy_df['ols_avg_prediction'] - yy_df['elo_avg']).abs()
yy_df['ols_ea_error'] = (yy_df['ols_ea_prediction'] - yy_df['elo_advantage']).abs()
yy_df['training'] = (yy_df['gamenum'] % 3)
insample_scores = yy_df.groupby('training')['ols_avg_error'].agg(['mean', 'median', 'std'])
Example #36
    def optimise_combination(self):
        """
        Use multiple linear regression to determine the optimal weighted
        combination of the GEOGRAPHIC, GENETIC and FEATURE methods.
        """

        df = {}
        df["auth"] = self.common_auth_combo_vector

        names = ("geo", "gen", "feat")
        funcs = (distance.build_optimal_geographic_matrix,
                    distance.build_optimal_genetic_matrix,
                    distance.build_optimal_feature_matrix)
        for name, func in zip(names, funcs):
            austro_method = self.compute_method_vector(func(self.austrolangs), self.common_austro_langs, self.wals_austro_trans)
            indo_method =  self.compute_method_vector(func(self.indolangs), self.common_indo_langs, self.wals_indo_trans)
            df[name] = np.concatenate([austro_method, indo_method])

        df = pd.DataFrame(df)
        df.to_csv("calibration_results/feature_data.csv")
        model = smf.wls('auth ~ geo + gen + feat', data=df, weights=self.weights).fit()

        fp = open("calibration_results/optimal_combination_weights", "w")
#        fp.write("intercept\t%f\n" % model.params["Intercept"])
        fp.write("intercept\t%f\n" % 0.0)
        fp.write("geo\t%f\n" % model.params["geo"])
        fp.write("gen\t%f\n" % model.params["gen"])
        fp.write("feat\t%f\n" % model.params["feat"])
        fp.close()

#        return (model.params["Intercept"], model.params["geo"], model.params["gen"], model.params["feat"])

        combo_austro = distance.build_optimal_combination_matrix(self.austrolangs)
        combo_indo = distance.build_optimal_combination_matrix(self.indolangs)
        D, intt, mult = self.fit_models(combo_austro, combo_indo, "combo")
        print "best combo D: ", D

        fp = open("calibration_results/optimal_combination_weights", "w")
        fp.write("intercept\t%f\n" % intt)
        print(intt)
        fp.write("geo\t%f\n" % (mult*model.params["geo"]))
        print(mult*model.params["geo"])
        fp.write("gen\t%f\n" % (mult*model.params["gen"]))
        print(mult*model.params["gen"])
        fp.write("feat\t%f\n" % (mult*model.params["feat"]))
        print(mult*model.params["feat"])
        fp.close()

        return

        # NOTE: everything below this bare return is unreachable dead code.
        return (best_intercept, best_weights[0], best_weights[1], best_weights[2])
        old_D = 1000
        lowest_D = 1000
        weights = [1.0/3, 1.0/3, 1.0/3]
        best_weights = weights[:]
        intercept = 0.5
        best_intercept = 0.5
        for iterations in range(0,10000):
            oldweights = weights[:]
            oldint = intercept
            # change params
            if random.randint(1,100) == 42:
                # Go back to best so far
                weights = best_weights[:]
                intercept = best_intercept
            elif random.randint(1,3) == 1:
                # shuffle weights
                random.shuffle(weights)
            elif random.randint(1,3) == 2:
                # shift weights
                source, target = random.sample([0,1,2],2)
                delta = random.sample([0.01, 0.05, 0.1, 0.2],1)[0]
                if weights[source] > delta:
                    weights[source] -= delta
                    weights[target] += delta
            elif random.randint(1,3) == 3:
                # shift intercept
                delta = random.sample([0.01, 0.05, 0.1, 0.2],1)[0]
                if random.randint(1,2) == 1 and intercept >= delta:
                    intercept -= delta
                elif intercept <= 1.0 - delta:
                    intercept += delta

            observations = [weights[0]*a + weights[1]*b + weights[2]*c for a, b, c in zip(geo, gen, feat)]
            D, p = scipy.stats.kstest(observations, baselinecdf)
            if D < old_D or random.randint(1,100) < 20:
                # We've improved, or it's a rare backward step
                old_D = D
            else:
                # Keep old value
                weights = oldweights[:]
                intercept = oldint
            if D < lowest_D:
                lowest_D = D
                best_weights = weights
                best_intercept = intercept

#        df = {}
#        df["auth"] = self.auth_combo_vector
#        df["geo"] = np.concatenate([geo_austro, geo_indo])
#        df["gen"] = np.concatenate([gen_austro, gen_indo])
#        df["feat"] = np.concatenate([feat_austro, feat_indo])
#        df = pd.DataFrame(df)
#        df.to_csv("calibration_results/combination_data.csv")
#        model = smf.ols('auth ~ geo + gen + feat', data=df).fit()
#        weights = [model.params[x] for x in ("geo", "gen", "feat")]

        fp = open("calibration_results/optimal_combination_weights", "w")
        fp.write("intercept\t%f\n" % best_intercept)
        fp.write("geo\t%f\n" % best_weights[0])
        fp.write("gen\t%f\n" % best_weights[1])
        fp.write("feat\t%f\n" % best_weights[2])
        fp.close()

        return (best_intercept, best_weights[0], best_weights[1], best_weights[2])
Example #37
    def optimise_feature(self):

        conn = sqlite3.connect("../WALS2SQL/wals.db")
        cursor = conn.cursor()
        cursor.execute('''PRAGMA cache_size = -25000''')

        wals2sql.compute_dense_features(conn, cursor, 25)
        dense_features = wals2sql.get_dense_features(conn, cursor)
        cursor.close()
        conn.close()
        comparators = distance.build_comparators()

        # Ugly hack
        langs_by_name = {}
        for lang in self.austrolangs:
            langs_by_name[lang.name] = lang
        for lang in self.indolangs:
            langs_by_name[lang.name] = lang

        # Identify good features
        good_features = []
        long_good_features = []
        for index, feature in enumerate(dense_features):
            if feature == bwo:
                continue
            for l1, l2 in itertools.chain(itertools.combinations(self.common_austro_langs, 2), itertools.combinations(self.common_indo_langs, 2)):
                l1 = langs_by_name[l1]
                l2 = langs_by_name[l2]
                useful_points = 0
                if feature in l1.data and feature in l2.data:
                    useful_points += 1
            if useful_points > 0:
                good_features.append("feat%d" % index)
                long_good_features.append(feature)

        # Compute supermeans
        austromeans = {}
        austrosupermean = 0
        austrosupernorm = 0
        for feature in long_good_features:
            austromeans[feature] = 0
            norm = 0
            for l1, l2 in itertools.combinations(self.common_austro_langs, 2):
                l1 = langs_by_name[l1]
                l2 = langs_by_name[l2]
#                pdb.set_trace()
                if feature in l1.data and feature in l2.data:
                    austromeans[feature] += comparators[feature](l1.data[feature], l2.data[feature])
                    norm += 1
            if norm:
                austromeans[feature] /= norm
                austrosupermean += austromeans[feature]
                austrosupernorm += 1
            else:
                austromeans[feature] = "NODATA"
        if austrosupernorm:
            austrosupermean /= austrosupernorm
        else:
            austrosupermean = 0.5
        for feature in austromeans:
            if austromeans[feature] == "NODATA":
                austromeans[feature] = austrosupermean

        indomeans = {}
        indosupermean = 0
        indosupernorm = 0
        for feature in long_good_features:
            indomeans[feature] = 0
            norm = 0
            for l1, l2 in itertools.combinations(self.common_indo_langs, 2):
                l1 = langs_by_name[l1]
                l2 = langs_by_name[l2]
                if feature in l1.data and feature in l2.data:
                    indomeans[feature] += comparators[feature](l1.data[feature], l2.data[feature])
                    norm += 1
            if norm:
                indomeans[feature] /= norm
                indosupermean += indomeans[feature]
                indosupernorm += 1
            else:
                indomeans[feature] = "NODATA"
        if indosupernorm:
            indosupermean /= indosupernorm
        else:
            indosupermean = 0.5
        for feature in indomeans:
            if indomeans[feature] == "NODATA":
                indomeans[feature] = indosupermean

        # Actually compute raw data
        df = {}
        df["auth"] = self.common_auth_combo_vector
        for feature, long_feature in zip(good_features, long_good_features):
            if long_feature == bwo:
                continue
            df[feature] = []
            for l1, l2 in itertools.chain(itertools.combinations(self.common_austro_langs, 2), itertools.combinations(self.common_indo_langs, 2)):
                l1 = langs_by_name[l1]
                l2 = langs_by_name[l2]
                if long_feature in l1.data and long_feature in l2.data:
                    df[feature].append(comparators[long_feature](l1.data[long_feature], l2.data[long_feature]))
                else:
                    if l1 in self.austrolangs:
                        df[feature].append(austromeans[long_feature])
                    else:
                        df[feature].append(indomeans[long_feature])

        df = pd.DataFrame(df)
        df.to_csv("calibration_results/feature_data.csv")

        austrodf = df[0:len(self.common_auth_austro_vector)]
        indodf = df[len(self.common_auth_austro_vector):]

        # Optimise for a fixed length of time
        rank = []
        starttime = time.time()
        while (time.time() - starttime) < 30*60:
            # Generate a random binary vector indicating which
            # features are and are not in the model
            on_features = random.randint(1, len(good_features))
            feature_selectors = [True,]*on_features + [False,]*(len(good_features)-on_features)
            random.shuffle(feature_selectors)

            # Fit a model using the randomly selected features
            model_spec = "auth ~ " + " + ".join([feat for feat, sel in zip(good_features, feature_selectors) if sel])
            model = smf.wls(model_spec, data=df, weights=self.weights).fit()
            # Compute correlations for the two families
            # separately
            austrofit = model.fittedvalues[0:len(self.common_auth_austro_vector)]
            austroauth = austrodf["auth"]
            austro_correl = austroauth.corr(austrofit)
            indofit = model.fittedvalues[len(self.common_auth_austro_vector):]
            indoauth = indodf["auth"]
            indo_correl = indoauth.corr(indofit)
            # Record pertinent details in a big list
            min_correl = min(austro_correl, indo_correl)
            thingy = (min_correl, austro_correl, indo_correl, feature_selectors.count(True), feature_selectors)
            rank.append(thingy)
            if len(rank) == 50000:
                # List is getting kind of long
                # Let's keep the best 10% and ditch the rest,
                # then keep going...
                rank.sort()
                rank.reverse()
                rank = rank[0:5000]

        # Find the highest min correlation 
        rank.sort()
        rank.reverse()
        best_min_correl = rank[0][0]
        # Now, filter rank to include only those models with
        # a min correlation within 5% of the best possible
        # and rank them by number of features in model, finding
        # the highest filter count
        rank = [(c,m,a,i,s) for (m,a,i,c,s) in rank if m>=0.95*best_min_correl]
        rank.sort()
        rank.reverse()
        highest_count = rank[0][0]
        # Now, filter rank to include only those models with
        # the highest number of features, and rank them by
        # min correlation
        rank = [(m,a,i,c,s) for (c,m,a,i,s) in rank if c == highest_count]
        rank.sort()
        rank.reverse()
        # Take the best
        best_selectors = rank[0][-1]
        best_features = [feat for feat, sel in zip(good_features, best_selectors) if sel]
        model_spec = "auth ~ " + " + ".join(best_features)
        model = smf.wls(model_spec, data=df, weights=self.weights).fit()
        weights = {}
        for index, feature in enumerate(dense_features):
            if "feat%d" % index in best_features:
                weights[feature] = model.params["feat%d" % index]
                print(index, weights[feature])

        func = distance.feature_matrix_factory(weights)
        D, intercept, mult = self.evaluate_method(func, "feat")

        fp = open("calibration_results/optimal_feature_weights", "w")
        fp.write("%f\tintercept\n" % (intercept))
        for index, feature in enumerate(dense_features):
            if "feat%d" % index in best_features:
                fp.write("%f\t%s\n" % (mult*model.params["feat%d" % index], feature))
        fp.close()

        print "Best feature D: ", D
Example #38
        autoCorr.append(m.log(abs(np.real(densTimeSeries[(int(L/2) + 2)][i] - avDens[(int(L/2) + 2)][0]))))
        trivWeights.append(1.0)
        corrInfo.append(str(i*10.0/float(numTimeSlices-1))+" "+str(m.log(abs(np.real(densTimeSeries[(int(L/2) + 2)][i] - avDens[(int(L/2) + 2)][0]))))+"\n")

y_list = timeSeries
x_list = autoCorr
y_err = trivWeights

# put x and y into a pandas DataFrame, and the weights into a Series
ws = pd.DataFrame({
    'x': x_list,
    'y': y_list
})
weights = pd.Series(trivWeights)

wls_fit = sm.wls('x ~ y', data=ws, weights=1.0 / ((weights)**2)).fit()
#ols_fit = sm.ols('x ~ y', data=ws).fit()

#print avDens
#print("\nThe mean current should be:\n")
avCurr = cscCurrentMatrix.dot(vecsLR)
#print avCurr


with open(resultsPlace+'eigenvalues.dat', 'w') as f:
    for eig in valsLR:
        f.write(str(np.real(eig))+'\n')

#with open(resultsPlace+'fullEigenvalues.dat', 'w') as f:
#    for eig in vals:
#        f.write(str(eig)+'\n')