def corr(self, data_source='Raw', model='auto'):
    """Write per-year correlation tables and scatter plots into the report.

    For every year in ``self.year`` a report section is created and the
    correlation matrix is taken from a ``CrossSectionRegionDataExplorer``
    built on that year's data.  For each variable a subsection is added
    containing a table of the entries whose absolute-correlation rank is
    2 through ``len(variables)`` (values formatted to two decimals),
    followed by one scatter plot per row of that table.

    Several intermediate objects are also printed to stdout.

    Args:
        data_source: ``'Raw'`` selects ``self.variables`` / ``self.data``;
            any other value selects ``self.generator_variables`` /
            ``self.generate_data``.
        model: Accepted but not used by this method.
    """
    if re.match('^Raw$', data_source):
        variables, data = self.variables, self.data
    else:
        variables, data = self.generator_variables, self.generate_data

    two_decimals = '{0:.2f}'.format

    for year in self.year:
        self.report.create_section(year)
        explorer = CrossSectionRegionDataExplorer(data[year])

        corr_matrix = explorer.corr()
        # Rank by magnitude so strong negative correlations rank high too.
        rank_matrix = corr_matrix.applymap(abs).rank(method='max', ascending=False)
        print(corr_matrix)
        print(corr_matrix.shape[0], len(variables))
        print(rank_matrix)

        top_number = min(len(variables) + 1, 11)
        for var in variables:
            self.report.create_subsection(var)

            ranked = (
                pd.DataFrame(
                    {
                        'variable': rank_matrix.index,
                        'rank': rank_matrix[var],
                        'corr': corr_matrix[var],
                    },
                    columns=['rank', 'variable', 'corr'],
                )
                .set_index('rank')
                .sort_index()
                # Start at rank 2; rank 1 is presumably the variable's
                # correlation with itself -- TODO confirm.
                .reindex(list(range(2, len(variables) + 1)))
                .set_index('variable')
                .applymap(two_decimals)
                .iloc[:top_number]
            )

            table = self.dataframe_to_list(ranked)
            self.report.add_table(table['data'], table['nrow'], table['ncol'])

            for xvar in ranked.index:
                figure = explorer.scatter(y=var, x=xvar, save=True)
                self.report.add_matplotlib_graph(figure, caption='图形')
                figure.close()

            print('-------------------------')
            print(ranked)
            print(self.dataframe_to_list(ranked))
            print('-------------------------')
def variable_transformation(self):
    """Build the derived ("generated") data set from the raw yearly data.

    For each year in ``self.data.items``, every column of
    ``self.data[year]`` after the first is converted into one derived
    column, applying these steps in order:

    1. If ``self.raw_variables[var]`` is not ``None``, the column is
       replaced by the result of
       ``CrossSectionRegionDataExplorer.per_variable(pop=..., var=[var])``
       and is named ``"<var>|<pop>"``.  If its minimum is above 0 and its
       maximum below 1, the values are multiplied by 100 and the column is
       stored as is.
    2. Otherwise, if the minimum is above 0 and the maximum below 100, the
       column is stored unchanged.
    3. Otherwise, if the minimum is above 0, the natural logarithm is taken
       and ``"|"`` plus the log label is appended to the column name.
    4. Anything else is stored unchanged.

    Results are written to ``self.generate_data`` (a ``pd.Panel`` keyed by
    year) and ``self.generator_variables`` (list of derived column names,
    without ``'region'``, in first-seen order).

    Unlike earlier versions, ``self.assist_data`` is not modified, so the
    method can be called more than once.
    """
    generator_data = dict()
    # A list (not a set) keeps the column order deterministic between runs.
    self.generator_variables = []

    for year in list(self.data.items):
        raw_frame = self.data[year]
        # Drop 'region' from a copy rather than deleting it in place;
        # deleting it mutated self.assist_data and broke repeated calls.
        assist_data_year = self.assist_data[year].drop('region', axis=1)
        total_data = pd.merge(raw_frame, assist_data_year,
                              left_index=True, right_index=True, how='outer')
        csdexplorer = CrossSectionRegionDataExplorer(total_data)
        cross_section_frame = pd.DataFrame({'region': raw_frame['region']})

        for var in raw_frame.columns[1:]:
            variable_name = var
            dframe = pd.DataFrame({variable_name: raw_frame[variable_name]})

            population = self.raw_variables[var]
            is_share = False
            if population is not None:
                variable_name = '|'.join([var, population])
                dframe = csdexplorer.per_variable(pop=population, var=[var])
                is_share = (np.min(dframe[variable_name]) > 0
                            and np.max(dframe[variable_name]) < 1)

            if is_share:
                # Ratio in (0, 1): express as a percentage.
                dframe = dframe.applymap(lambda x: 100 * x)
            elif (np.min(dframe[variable_name]) > 0
                  and np.max(dframe[variable_name]) < 100):
                # Already on a small positive scale: keep as is.
                pass
            elif np.min(dframe[variable_name]) > 0:
                # Large strictly positive values: take logs and tag the name.
                dframe = dframe.applymap(np.log)
                dframe.columns = ['|'.join([variable_name, u'对数'])]

            cross_section_frame = pd.merge(cross_section_frame, dframe,
                                           left_index=True, right_index=True,
                                           how='outer')

        generator_data[year] = cross_section_frame
        for column in cross_section_frame.columns:
            if column not in self.generator_variables:
                self.generator_variables.append(column)

    self.generate_data = pd.Panel(generator_data)
    for y in self.year:
        # Every column except the first ('region') is cast to float64.
        value_columns = list(self.generate_data[y].columns)[1:]
        self.generate_data[y][value_columns] = (
            self.generate_data[y][value_columns].astype(np.float64)
        )

    self.generator_variables.remove('region')
def describe(self, data_source='Raw'):
    """Add descriptive-statistics tables to the report.

    A report section is created for each variable and, inside it, a
    subsection for each year in ``self.year``.  Each subsection receives a
    table built from ``CrossSectionRegionDataExplorer.describe()`` on the
    ``'region'`` column plus the variable's column, with every value
    formatted to two decimals.  The current variable and year are printed
    to stdout as progress output.

    Args:
        data_source: ``'Raw'`` selects ``self.variables`` / ``self.data``;
            any other value selects ``self.generator_variables`` /
            ``self.generate_data``.
    """
    use_raw = re.match('^Raw$', data_source)
    variables = self.variables if use_raw else self.generator_variables
    data = self.data if use_raw else self.generate_data

    two_decimals = '{0:.2f}'.format

    for var in variables:
        # One report section per variable.
        self.report.create_section(var)
        for year in self.year:
            # One subsection per year inside the variable's section.
            self.report.create_subsection(year)
            explorer = CrossSectionRegionDataExplorer(data[year][['region', var]])
            summary = explorer.describe().applymap(two_decimals)
            table = self.dataframe_to_list(summary)
            print(var, year)
            self.report.add_table(table['data'], table['nrow'], table['ncol'])