def __init__(self, filename, year=None, encoding='GBK'):
        self.cgss_db = CgssDatabase()

        self.stata_object = Statadata(filename, encoding=encoding)
        self.stata_file = self.stata_object.stata_file

        self.year = int(year)

        # 变量-标签字典
        self.variables_labels_dict = dict(
            zip(self.stata_object.variables, self.stata_object.variables_labels))
        # 变量-值标签字典
        self.variables_values_labels_dict = dict(
            zip(self.stata_object.variables, self.stata_object.variables_values_labels))
        # 值标签字典
        self.values_labels_dict = self.values_labels
Exemple #2
0
    def adj_Rsquared(self):
        return self.robj.get_var('slm_obj$adj.r.squared')

    @property
    def cov(self):
        return self.robj.get_var('slm_obj$cov.unscaled')

    @property
    def qr(self):
        return self.robj.get_var('lm_obj$qr$qr')

if __name__ == '__main__':
    rthread = RThread()

    stata_file = r'D:\data\test\wage1.dta'
    stdata = Statadata(stata_file)
    mdata = stdata.read()
    rdata = mdata[['lwage','educ','exper','tenure']]
    rthread.create_vars(rdata)

    lm = Rreg(robj=rthread,xvar=['educ','exper','tenure'],yvar='lwage')
    '''
    print(lm.coefficients)
    print(lm.fstatistic)
    print(lm.sigma)
    print(lm.df)
    print(lm.qr)
    print(lm.Rsquared)
    print(lm.adj_Rsquared)
    print(lm.cov)
    print(lm.rank)
class CgssStataSheet:
    """类CgssStataSheet用来读写cgss.dta(stata格式)的数据文件

    :param str filename: 想要读写的文件名
    :param int year: 年份
    :param str encoding: 编码方式,默认是'GBK'
    :return: 无返回值
    :var list variables: 变量列表
    :var list variables_labels: 变量标签列表
    :var list variables_values_labels: 变量的值标签
    :var list values_labels: 值标签的内容
    :var dict variables_labels_dict: 变量-标签字典
    :var dict variables_values_labels_dict: 变量-值标签字典
    :var dict values_labels_dict: 值-标签字典
    """

    def __init__(self, filename, year=None, encoding='GBK'):
        self.cgss_db = CgssDatabase()

        self.stata_object = Statadata(filename, encoding=encoding)
        self.stata_file = self.stata_object.stata_file

        self.year = int(year)

        # 变量-标签字典
        self.variables_labels_dict = dict(
            zip(self.stata_object.variables, self.stata_object.variables_labels))
        # 变量-值标签字典
        self.variables_values_labels_dict = dict(
            zip(self.stata_object.variables, self.stata_object.variables_values_labels))
        # 值标签字典
        self.values_labels_dict = self.values_labels

    def insert(self):
        """insert方法用来插入数据到Mongodb数据库集合
        """
        # 读入stata文件数据
        data = self.stata_object.read()
        # 数据行数
        data_rows = data.shape[0]
        print(data_rows)

        # 读取每一行数据(产生一条记录)
        for i in range(data_rows):
            # 每一条记录用字典记录,存储在record变量中
            record = dict({'year': self.year})
            # 读取一条记录
            row_data = data.iloc[i]

            j = 0
            # 包装记录,即记录变量数据在字典record中
            # ind是行数据的索引(即变量)
            for ind in row_data.index:
                j = j + 1
                # 初始化值标签
                value_label = None
                # 变量-值标签变量
                variable_value_labels = self.variables_values_labels_dict[ind]
                # 值标签字典
                value_labels = self.values_labels_dict.get(
                    variable_value_labels, None)
                # 变量的值
                value = row_data[ind]

                # 如果变量的值是数值型
                if isinstance(value, (np.float32, np.float64, np.integer)):
                    if np.isnan(value):
                        value = None
                    else:
                        if value_labels is None:
                            value = None
                        else:
                            if np.equal(value, np.int(value)):
                                value = int(value)
                                value_label = value_labels.get(
                                    np.int(value), None)
                            else:
                                value = float(value)
                elif isinstance(value, str):
                    if re.match('^\s*$', value) is not None:
                        value = None
                elif isinstance(value, (pd.tslib.Timestamp, datetime)):
                    if isinstance(value, pd.tslib.NaTType):
                        value = None
                    else:
                        value = re.split(' ', str(value))[0]
                else:
                    print(ind)
                    print(value, type(value))
                    raise TypeError
                record[ind] = {'label': self.variables_labels_dict[ind], 'serial_number': j,
                               'value': {
                                   'value': value,
                                   'label': value_label
                               }}
            print(i)
            print('record', record)
            self.cgss_db.collection.insert(record)

    @property
    def values_labels(self):
        labels = self.stata_object.values_labels
        new_labels_dict = dict()
        for var in labels:
            var_label = list(labels[var].items())
            label_tuple = [(np.uint(item[0]).astype(int), item[1])
                           for item in var_label]
            new_labels_dict[var] = dict(label_tuple)
        return new_labels_dict