Example #1
    def regression_analysis(self, y_column: str, *x_column: str) -> dict:
        """回归分析(OLS)

        :param y_column: y值所在的列名
        :param x_column: x值所在的列名
        :return: 字典,包括参数、检验结果
        """
        X_turple = (np.array(self.data[x_column[0]]), )
        for i in range(1, len(x_column)):
            for column_info in self.meta:
                if "{}. {}".format(column_info['index'], column_info['title']) == x_column[i]:
                    if column_info['type'] in ['rate', 'scale', 'numInput']:  # 连续性变量直接插入矩阵
                        X_turple += (np.array(self.data[x_column[i]]), )
                    elif column_info['type'] in ["radio", "checkbox", "sort"]:  # 分类变量转化为虚拟变量后删去参照组插入矩阵
                        dummy = sm.categorical(np.array(self.data[x_column[i]]))
                        X_turple += (dummy[:, 1:], )
                    break
        X = np.column_stack(X_turple)
        X = sm.add_constant(X)
        y = np.array(self.data[y_column])
        model = sm.OLS(y, X)
        result = model.fit()
        result_dict = dict()
        result_dict['params'] = [round(i, 3) for i in result.params]  # [constant, x1, x2, ...]
        result_dict['tvalues'] = [round(i, 3) for i in result.tvalues]
        result_dict['pvalues'] = [round(i, 3) for i in result.pvalues]
        result_dict['rsquared'] = round(result.rsquared, 3)
        result_dict['rsquared_adj'] = round(result.rsquared_adj, 3)
        result_dict['fvalue'] = round(result.fvalue, 3)
        result_dict['f_pvalue'] = round(result.f_pvalue, 3)
        result_dict['DW'] = round(durbin_watson(result.wresid), 3)
        result_dict['condition_number'] = round(result.condition_number)
        if np.isnan(result_dict['f_pvalue']):
            return None
        return result_dict
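    # A usage sketch (hypothetical: `survey` is an instance exposing the
    # .data DataFrame and .meta column list used above; the column names
    # are invented for illustration):
    #
    #     result = survey.regression_analysis(
    #         "1. Satisfaction",   # y: continuous outcome column
    #         "2. Age",            # the first x column is used as-is
    #         "3. Gender",         # categorical x column, expanded to dummies
    #     )
    #     if result is not None:
    #         print(result['params'], result['rsquared'])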
Example #2
    def make_dummy_vars(self):
        self.groups = np.zeros(self.n_samples, int)
        self.groups[self.dummy_slices[0]:self.dummy_slices[1]] = 1
        self.groups[self.dummy_slices[1]:] = 2
        self.dummy = sm.categorical(self.groups, drop=True)

        self.x = np.linspace(self.x_start, self.x_stop, self.n_samples)
        # drop reference category
        X = np.column_stack((self.x, self.dummy[:, 1:]))
        self.X = sm.add_constant(X, prepend=False)

        self.y_true = np.dot(self.X, self.beta)
        print("y_true.shape: ", self.y_true.shape)

        # introducing independent error term
        e = np.random.normal(size=self.n_samples)
        self.y = self.y_true + e
        print("y.shape: ", self.y.shape)

        # inspect the data
        print("groups:", self.groups)
        print("dummy (head) :", self.dummy[:5, :])
        print("X     :", self.X[:5, :])
        print("y     :", self.y[:5])
        return self.y, self.X
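    # A sketch of the natural follow-up (`sim` is a hypothetical instance;
    # n_samples, dummy_slices, x_start, x_stop and beta are assumed to be
    # set elsewhere on it):
    #
    #     y, X = sim.make_dummy_vars()
    #     res = sm.OLS(y, X).fit()
    #     print(res.params)  # estimates should be close to sim.beta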
Example #3
    def categorical(self, data: pd.DataFrame, key='industry'):
        """

        pd.DataFrame


        index : stock

        columns : key must be some of the columns, default: industry

                 industry    factor_i   ....   marketvalue
        stock1      A
        stock2      B


        return panel data

                    农林牧渔   非银金融   industry3 industry4 .....   industry29
        stock1          0       1           0       0               0
        stock2          1       0           0       0               0


        """
        return pd.DataFrame(sm.categorical(data[key].values, drop=True),
                            index=data.index,
                            columns=data[key].sort_values().unique())
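    # A usage sketch (hypothetical names: `helper` is an instance of this
    # class and `df` a stock-indexed DataFrame with an 'industry' column):
    #
    #     dummies = helper.categorical(df, key='industry')
    #     panel = df.join(dummies)  # attach the industry dummies to the factors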
Example #4
def industry_dict_weight():
    df = DataAPI.IndustryGet(industryVersion=u"SW",
                             industryVersionCD=u"",
                             industryLevel=u"1",
                             isNew=u"1",
                             field=u"",
                             pandas="1")
    industryList = df['industryID'].tolist()
    # replace industry 1030322 (index 21) with its three sub-industries
    del industryList[21]
    industryList.extend(["0103032201", "0103032202", "0103032203"])
    industryList.sort()  # sort by industry code
    dummy = sm.categorical(np.array(industryList), drop=True)
    industryDict = dict(zip(industryList, dummy))
    weightBaseDf = pd.DataFrame(
        dict(zip(industryList, np.zeros([len(industryList), 1])))).T
    weightBaseDf.rename(columns={0: 'weight0'}, inplace=True)
    weightBaseDf.index.name = 'industryID1'
    return industryDict, weightBaseDf
Example #5
	def load_file(file_path):
		"""input: file_path: the path to the data file
		   output: X: array of independent variables values, y: array of the dependent variable values
		"""
		data = pd.read_csv(file_path, delimiter="\t")
		df = pd.DataFrame(data)

		y = df["taxi_records"]
		X = df.drop(["lat", "long", "station_id", "station_name", "year", "zipcode", "taxi_records"], axis=1)

		# create dummy variables from month
		dummy_month = sm.categorical(X["month_beginning"].values, drop=True)
		dummy_month_df = pd.DataFrame(dummy_month)

		# drop one of the columns to have linear independence
		dummy_month_df = dummy_month_df.drop(dummy_month_df.columns[0], axis=1)
		# rename for human legibility
		dummy_month_df.columns = ["february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]

		# drop original categorical columns
		X = X.drop(["month_beginning"], axis=1)
		frames = [X, dummy_month_df]
		# concatenate dummy variables onto X dataframe
		X = pd.concat(frames, axis=1)

		print(X.head())

		#print(X.dtypes)

		# plt.hist(X['population'], bins=10)
		# plt.xlabel("population")
		# plt.ylabel("number of stations whose zipcode has that population")
		# plt.show()

		for i in range(len(X.columns.values)):
			print(i + 1, X.columns.values[i])

		y = y.to_numpy()
		X = X.to_numpy()

		return X, y
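	# A usage sketch (hypothetical file path): fit OLS with an intercept on
	# the loaded design.
	#
	#     X, y = load_file("taxi_data.tsv")
	#     res = sm.OLS(y, sm.add_constant(X)).fit()
	#     print(res.summary())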
Example #6
    def signal_factor_test(self, data):
        ## data cleaning
        data = data.dropna()
        data = data[data[factor.upper()] < data[factor.upper()].quantile(.95)]
        data = data[data["PCT_CHG_PER"] < data["PCT_CHG_PER"].quantile(.98)]
        data = data[data[factor.upper()] > data[factor.upper()].quantile(.05)]
        data = data[data["PCT_CHG_PER"] > data["PCT_CHG_PER"].quantile(.02)]
        data = data[data["PCT_CHG_PER"] != 0]
        a = data[factor.upper()].map(
            lambda x: x / (data[factor.upper()].max() - data[factor.upper()].min())).values
        data["%s_1" % factor.upper()] = a

        data["%s_2" % factor.upper()] = list(data[factor.upper()].map(lambda x: log(x)))

        ## overview
        ## data.plot.scatter(y='PCT_CHG_PER', x="%s_2" %factor.upper())
        ## from pandas.plotting import scatter_matrix
        ## scatter_matrix(data.iloc[:, 0:3], alpha=0.2, figsize=(6, 6), diagonal='kde')
        ## bp = data.groupby('INDUSTRY_SW')["PCT_CHG_PER"]

        ## factor bucketing
        def cla(n, lim):
            return '[%.f, %.f)' % (lim * (n // lim), lim * (n // lim) + lim)

        b = data[factor.upper()].apply(cla, args=(20,)).values
        data["%s_3" % factor.upper()] = b
        ##data.hist(by="%s_3" % factor.upper(), column="PCT_CHG_PER")
        group_industry = data.groupby('INDUSTRY_SW')
        #group_PE = data.groupby(['INDUSTRY_SW', 'VAL_PE_DEDUCTED_TTM_2', ])
        #group_industry.boxplot(column="PCT_CHG_PER")
        ## WLS fit
        data = data.dropna()
        industrys = data['INDUSTRY_SW']
        x2 = np.array(list(industrys), dtype=str)
        dummy = sm.categorical(x2, drop=True)  # dummy variables for the SW (Shenwan) level-1 industries

        x1 = np.array(list(data["%s_1" % factor.upper()]))
        x = np.column_stack((x1, dummy))  # stack the regressors for the fit
        y = np.array(list(data["PCT_CHG_PER"]))  # dependent variable for the regression
        results = sm.WLS(y, x, weights=list(data.values.T[3])).fit()
        return results, data
Example #7
	def load_file(file_path):
		"""input: file_path: the path to the data file
		   output: X: array of independent variables values, y: array of the dependent variable values
		"""
		#TODO:
		#1. Use pandas to load data from the file. Here you can also re-use most of the code from part I.
		#2. Select which independent variables best predict the dependent variable count.
		data = pd.read_csv(file_path, delimiter="\t")
		df = pd.DataFrame(data)

		y = df["taxi_records"]
		X = df.drop(["lat", "long", "station_id", "station_name", "year", "zipcode", "taxi_records"], axis=1)

		# create dummy variables from month
		dummy_month = sm.categorical(X["month_beginning"].values, drop=True)
		dummy_month_df = pd.DataFrame(dummy_month)

		# drop one of the columns to have linear independence
		dummy_month_df = dummy_month_df.drop(dummy_month_df.columns[0], axis=1)
		# rename for human legibility
		dummy_month_df.columns = ["february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]

		# drop original categorical columns
		X = X.drop(["month_beginning"], axis=1)
		frames = [X, dummy_month_df]
		# concatenate dummy variables onto X dataframe
		X = pd.concat(frames, axis=1)

		print(X.head())

		print(X.dtypes)

		for i in range(len(X.columns.values)):
			print(i + 1, X.columns.values[i])

		y = y.to_numpy()
		X = X.to_numpy()

		return X, y
Example #8
ax.plot(x, iv_u, 'r--')
ax.plot(x, iv_l, 'r--')
ax.legend(loc='best');


# ## OLS with dummy variables
# 
# We generate some artificial data. There are 3 groups which will be modelled using dummy variables. Group 0 is the omitted/benchmark category.

nsample = 50
groups = np.zeros(nsample, int)
groups[20:40] = 1
groups[40:] = 2
#dummy = (groups[:,None] == np.unique(groups)).astype(float)

dummy = sm.categorical(groups, drop=True)
x = np.linspace(0, 20, nsample)
# drop reference category
X = np.column_stack((x, dummy[:,1:]))
X = sm.add_constant(X, prepend=False)

beta = [1., 3, -3, 10]
y_true = np.dot(X, beta)
e = np.random.normal(size=nsample)
y = y_true + e


# Inspect the data:

print(X[:5,:])
print(y[:5])
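# A natural next step is to fit OLS on this design and compare the estimates
# with `beta`:

res2 = sm.OLS(y, X).fit()
print(res2.params)  # should be close to [1., 3, -3, 10]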
Example #9
import pandas as pd
import numpy as np
import statsmodels.api as sm


df = pd.read_csv('dataset_ps_4.csv')
print(df)

target_base = np.array(df.iloc[:, 1].values).reshape(10000, 1)
data_base = np.array(df.iloc[:, 2:9].values)
data_base = np.delete(data_base, 3, axis=1)
data_base = np.delete(data_base, -1, axis=1)
data_base = np.delete(data_base, 0, axis=1)
region = df.iloc[:, 5].values
exp = np.array(df.iloc[:, -1].values).reshape(10000, 1)
region_dummy = sm.categorical(region, drop=True)
data_base = np.hstack((data_base, region_dummy, exp))
data_base = np.delete(data_base, -2, axis=1)
data_base_df = pd.DataFrame(data_base)
data_base_df.columns = ['ability', 'age', 'female', 'edu', 'reg1', 'reg2', 'reg3', 'exp']
# print(data_base_df)



"""(a)"""
sum_data = df.iloc[:, 1:].values
# per-column summary statistics: mean, std, min, max
sum_data_mean = [np.mean(sum_data[:, i]) for i in range(sum_data.shape[1])]
sum_data_std = [np.std(sum_data[:, i]) for i in range(sum_data.shape[1])]
sum_data_min = [np.min(sum_data[:, i]) for i in range(sum_data.shape[1])]
sum_data_max = [np.max(sum_data[:, i]) for i in range(sum_data.shape[1])]
Example #10
from patsy.contrasts import Treatment
levels = [1,2,3,4]
contrast = Treatment(reference=0).code_without_intercept(levels)
print(contrast.matrix)


# Here we used `reference=0`, which implies that the first level, Hispanic, is the reference category against which the other level effects are measured. As mentioned above, the columns do not sum to zero and are thus not independent of the intercept. To be explicit, let's look at how this would encode the `race` variable.

hsb2.race.head(10)


print(contrast.matrix[hsb2.race-1, :][:20])


sm.categorical(hsb2.race.values)


# This is a bit of a trick, as the `race` category conveniently maps to zero-based indices. If it does not, this conversion happens under the hood, so this won't work in general but nonetheless is a useful exercise to fix ideas. The below illustrates the output using the three contrasts above

from statsmodels.formula.api import ols
mod = ols("write ~ C(race, Treatment)", data=hsb2)
res = mod.fit()
print(res.summary())


# We explicitly gave the contrast for race; however, since Treatment is the default, we could have omitted this.
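# For example, relying on the default Treatment coding gives the same fit:

res_default = ols("write ~ C(race)", data=hsb2).fit()
print(res_default.summary())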

#### Simple Coding

# Like Treatment Coding, Simple Coding compares each level to a fixed reference level. However, with simple coding, the intercept is the grand mean of all the levels of the factors. Patsy doesn't have the Simple contrast included, but you can easily define your own contrasts. To do so, write a class that contains a code_with_intercept and a code_without_intercept method that returns a patsy.contrast.ContrastMatrix instance
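# A sketch of such a class, following the recipe just described (assumes
# numpy is imported as np, as in the examples above; the matrix is the
# standard Simple-coding contrast):

from patsy.contrasts import ContrastMatrix

def _name_levels(prefix, levels):
    return ["[%s%s]" % (prefix, level) for level in levels]

class Simple(object):
    def _simple_contrast(self, levels):
        nlevels = len(levels)
        # each non-reference level vs. the reference, centered so that the
        # intercept is the grand mean
        contr = -1. / nlevels * np.ones((nlevels, nlevels - 1))
        contr[1:][np.diag_indices(nlevels - 1)] = (nlevels - 1.) / nlevels
        return contr

    def code_with_intercept(self, levels):
        contrast = np.column_stack((np.ones(len(levels)),
                                    self._simple_contrast(levels)))
        return ContrastMatrix(contrast, _name_levels("Simp.", levels))

    def code_without_intercept(self, levels):
        contrast = self._simple_contrast(levels)
        return ContrastMatrix(contrast, _name_levels("Simp.", levels))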
Example #11
ax.plot(x, res.fittedvalues, 'r--.', label="OLS")
ax.plot(x, iv_u, 'r--')
ax.plot(x, iv_l, 'r--')
ax.legend(loc='best')

# ## OLS with dummy variables
#
# We generate some artificial data. There are 3 groups which will be modelled using dummy variables. Group 0 is the omitted/benchmark category.

nsample = 50
groups = np.zeros(nsample, int)
groups[20:40] = 1
groups[40:] = 2
#dummy = (groups[:,None] == np.unique(groups)).astype(float)

dummy = sm.categorical(groups, drop=True)
x = np.linspace(0, 20, nsample)
# drop reference category
X = np.column_stack((x, dummy[:, 1:]))
X = sm.add_constant(X, prepend=False)

beta = [1., 3, -3, 10]
y_true = np.dot(X, beta)
e = np.random.normal(size=nsample)
y = y_true + e

# Inspect the data:

print(X[:5, :])
print(y[:5])
print(groups)
Example #12
from patsy.contrasts import Treatment
levels = [1, 2, 3, 4]
contrast = Treatment(reference=0).code_without_intercept(levels)
print(contrast.matrix)

# Here we used `reference=0`, which implies that the first level,
# Hispanic, is the reference category against which the other level effects
# are measured. As mentioned above, the columns do not sum to zero and are
# thus not independent of the intercept. To be explicit, let's look at how
# this would encode the `race` variable.

hsb2.race.head(10)

print(contrast.matrix[hsb2.race - 1, :][:20])

sm.categorical(hsb2.race.values)

# This is a bit of a trick, as the `race` category conveniently maps to
# zero-based indices. If it does not, this conversion happens under the
# hood, so this will not work in general but nonetheless is a useful exercise
# to fix ideas. The below illustrates the output using the three contrasts
# above

from statsmodels.formula.api import ols
mod = ols("write ~ C(race, Treatment)", data=hsb2)
res = mod.fit()
print(res.summary())

# We explicitly gave the contrast for race; however, since Treatment is
# the default, we could have omitted this.
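# As in the earlier example, relying on the default Treatment coding gives
# the same fit:

res_default = ols("write ~ C(race)", data=hsb2).fit()
print(res_default.summary())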
Example #13
# 2        Married  <=50K           0.00000
# 3        Married  <=50K           0.00000
# 4        Married  <=50K           0.00000
# =============================================================================
# =============================================================================
# For simplicity, we save the Income variable as y.
# y = adult_tr[['Income']]
y = adult_tr[['Income']]
# y was created with only one column, [18761 rows x 1 columns]
# We have a categorical variable, Marital status, among our predictors.
# The CART model implemented in the sklearn package needs categorical variables converted to a dummy variable form.
# Thus, we will make a series of dummy variables for Marital status using the categorical() command.
# =============================================================================
mar_np = np.array(adult_tr['Marital status'])
# mar_np created - we turn the variable Marital status into an array using array()
mar_cat = sm.categorical(mar_np, drop=True)
mar_cat_dict = stattools.categorical(mar_np, dictnames=True)
# Now we need to add the newly made dummy variables back into the X variables.
mar_cat_pd = pd.DataFrame(mar_cat)
# we convert the mar_cat matrix into a data frame using the DataFrame() command
X = pd.concat((adult_tr[['Cap_Gains_Losses']], mar_cat_pd), axis=1)
# =============================================================================
# We then use the concat() command to attach the predictor variable Cap_Gains_Losses to
# the data frame of dummy variables that represent marital status. We save the result as X.
# =============================================================================
# =============================================================================
# Data is like this
# 18749          0.000000  0.0  1.0  0.0  0.0  0.0
# 18750          0.010550  0.0  0.0  1.0  0.0  0.0
# 18751          1.000000  0.0  1.0  0.0  0.0  0.0
# 18752          0.362489  0.0  1.0  0.0  0.0  0.0
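# The comments above reference the sklearn CART model; a minimal sketch of
# the follow-up fit (the hyperparameters are illustrative, not taken from
# the source):
from sklearn.tree import DecisionTreeClassifier

cart = DecisionTreeClassifier(criterion="gini", max_leaf_nodes=5)
cart.fit(X, y.values.ravel())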
Example #14
xomNasdaqOilModel = linear_model.LinearRegression()
xomNasdaqOilModel.fit(combined, xomDataFilled)
xomNasdaqOilModel.score(combined, xomDataFilled)

import statsmodels.api as sm

X = xomDataFilled.reshape(1, -1)[0]
y = nasdaqDataFilled.reshape(1, -1)[0]

model = sm.OLS(y, X)  # regress the NASDAQ series (y) on the XOM series (X)
results = model.fit()
print(results.summary())

googData = readFile(googFile)
nasdaqData = readFile(nasdaqFile)

googData["Months"] = [int(x[5:7]) for x in googData["Date"]]

dummy = sm.categorical(np.asarray(googData["Months"]), drop=True)

#dummy = dummy[:,1:]

xData = np.hstack(
    (dummy, nasdaqData["Returns"].reshape(len(nasdaqData["Returns"]),
                                          -1)))[:-1]
yData = googData["Returns"][:-1].reshape(-1, 1)

model = sm.OLS(yData, xData)
results = model.fit()
print(results.summary())