Ejemplo n.º 1
0
    def corr(self,data_source='Raw',model='auto'):
        """Write per-year correlation rankings, tables and scatter plots to the report.

        One report section is created per entry of ``self.year``.  Inside it,
        every variable gets a subsection holding a table of the other
        variables ordered by the rank of their absolute correlation with it,
        followed by one scatter plot per row of that table.

        :param data_source: a value matching ``'^Raw$'`` selects
            ``self.variables`` / ``self.data``; any other value selects
            ``self.generator_variables`` / ``self.generate_data``.
        :param model: accepted but not used by this method.
        """
        use_raw = re.match('^Raw$', data_source)
        variables = self.variables if use_raw else self.generator_variables
        data = self.data if use_raw else self.generate_data

        separator = '-------------------------'

        for year in self.year:
            self.report.create_section(year)
            explorer = CrossSectionRegionDataExplorer(data[year])
            corr = explorer.corr()
            # Rank by absolute correlation, strongest first; ties share the
            # highest rank number of their group (method='max').
            abs_rank = corr.applymap(abs).rank(method='max', ascending=False)

            # Diagnostic output on stdout.
            print(corr)
            print(corr.shape[0], len(variables))
            print(abs_rank)

            row_limit = min(len(variables) + 1, 11)
            # Ranks kept in the table start at 2; rank 1 is presumably the
            # variable's correlation with itself -- TODO confirm.
            kept_ranks = list(range(2, len(variables) + 1))

            for var in variables:
                self.report.create_subsection(var)

                ranked = pd.DataFrame(
                    {
                        'variable': abs_rank.index,
                        'rank': abs_rank[var],
                        'corr': corr[var],
                    },
                    columns=['rank', 'variable', 'corr'],
                )
                ranked = ranked.set_index('rank').sort_index()
                ranked = ranked.reindex(kept_ranks)
                ranked = ranked.set_index('variable')
                # Render the remaining values as two-decimal strings, then
                # cap the number of rows.
                ranked = ranked.applymap('{0:.2f}'.format).iloc[:row_limit]

                table = self.dataframe_to_list(ranked)
                self.report.add_table(table['data'], table['nrow'], table['ncol'])

                for xvar in ranked.index:
                    figure = explorer.scatter(y=var, x=xvar, save=True)
                    self.report.add_matplotlib_graph(figure, caption='图形')
                    figure.close()

                # Diagnostic output on stdout.
                print(separator)
                print(ranked)
                print(self.dataframe_to_list(ranked))
                print(separator)
Ejemplo n.º 2
0
    def variable_transformation(self):
        """Build the generated (transformed) data set from the raw yearly data.

        For every item of ``self.data`` the raw frame is merged with that
        item's frame from ``self.assist_data`` (without its ``region``
        column) and handed to ``CrossSectionRegionDataExplorer``.  Each raw
        column after the first is then processed:

        * if ``self.raw_variables[var]`` is not ``None`` the column is
          replaced by the result of ``per_variable`` and named
          ``"<var>|<raw_variables[var]>"``;
        * minimum > 0 and maximum < 1: values are multiplied by 100;
        * minimum > 0 and maximum < 100: values are kept unchanged;
        * minimum > 0 otherwise: values are log-transformed and the column
          is renamed ``"<name>|对数"``;
        * minimum not above 0: the variable is left out.

        Results are stored on the instance:

        * ``self.generate_data`` -- ``pd.Panel`` of the per-year frames, with
          every column after the first cast to ``float64`` for each year in
          ``self.year``;
        * ``self.generator_variables`` -- list of generated column names
          (``region`` excluded) in first-seen order, so repeated runs yield
          the same ordering.

        NOTE(review): ``pd.Panel`` was removed in pandas 0.25, so this method
        needs an older pandas release.
        """
        generator_data = dict()
        self.generator_variables = []
        # 'region' is pre-seeded so that it never enters the variable list.
        seen_columns = {'region'}

        for year in list(self.data.items):
            raw_frame = self.data[year]
            # drop() returns a new frame: the frame held by self.assist_data
            # keeps its 'region' column, so this method can be run again.
            assist_data_year = self.assist_data[year].drop('region', axis=1)
            total_data = pd.merge(raw_frame, assist_data_year, left_index=True, right_index=True, how='outer')

            csdexplorer = CrossSectionRegionDataExplorer(total_data)
            cross_section_frame = pd.DataFrame({'region': raw_frame['region']})
            for var in raw_frame.columns[1:]:
                variable_name = var
                dframe = pd.DataFrame({variable_name: raw_frame[variable_name]})

                population = self.raw_variables[var]
                if population is not None:
                    variable_name = '|'.join([var, population])
                    dframe = csdexplorer.per_variable(pop=population, var=[var])

                lowest = np.min(dframe[variable_name])
                if not lowest > 0:
                    # Only strictly positive variables are carried over.
                    continue

                highest = np.max(dframe[variable_name])
                if highest < 1:
                    # Values in (0, 1) are rescaled by a factor of 100.
                    dframe = dframe.applymap(lambda x: 100 * x)
                elif not highest < 100:
                    # Large positive values are log-transformed and the
                    # column name is tagged accordingly.
                    dframe = dframe.applymap(np.log)
                    dframe.columns = ['|'.join([variable_name, u'对数'])]
                # Values in (0, 100) fall through unchanged.

                cross_section_frame = pd.merge(cross_section_frame, dframe, left_index=True, right_index=True, how='outer')

            generator_data[year] = cross_section_frame
            for column in cross_section_frame.columns:
                if column not in seen_columns:
                    seen_columns.add(column)
                    self.generator_variables.append(column)

        self.generate_data = pd.Panel(generator_data)
        for y in self.year:
            frame = self.generate_data[y]
            value_columns = list(frame.columns)[1:]
            frame[value_columns] = frame[value_columns].astype(np.float64)
Ejemplo n.º 3
0
    def describe(self,data_source='Raw'):
        """Add descriptive-statistics tables to the report.

        A report section is created for each variable and, inside it, a
        subsection for each entry of ``self.year`` containing the table
        produced by ``CrossSectionRegionDataExplorer.describe`` for the
        ``region`` column plus that variable, formatted to two decimals.

        :param data_source: a value matching ``'^Raw$'`` selects
            ``self.variables`` / ``self.data``; any other value selects
            ``self.generator_variables`` / ``self.generate_data``.
        """
        use_raw = re.match('^Raw$', data_source)
        variables = self.variables if use_raw else self.generator_variables
        data = self.data if use_raw else self.generate_data

        for var in variables:
            # One section per variable ...
            self.report.create_section(var)

            for year in self.year:
                # ... and one subsection per year inside it.
                self.report.create_subsection(year)

                explorer = CrossSectionRegionDataExplorer(data[year][['region', var]])
                summary = explorer.describe().applymap('{0:.2f}'.format)
                table = self.dataframe_to_list(summary)

                # Progress trace on stdout.
                print(var, year)
                self.report.add_table(table['data'], table['nrow'], table['ncol'])