def varimax_rotation(self, data):
    """Apply a varimax-style orthogonal rotation to the factor loadings.

    The unrotated loadings come from ``self.load(data)``.  An orthogonal
    rotation matrix is refined for at most ``self.q`` rounds, each round
    using one singular value decomposition, and the loop stops early once
    the sum of singular values grows by less than the factor
    ``1 + self.tol``.  The rotated loadings are stored on
    ``self.orthogonal``.

    :param data: table of observed variables; it is cast to float and its
        column labels are used as the index of the rotated-loadings frame.
    :return: the value produced by ``transform_table_data_to_html`` for the
        rotated-loadings table.
    """
    data = data.astype(float)

    unrotated = self.load(data)
    n_rows, n_factors = unrotated.shape
    rotation = np.eye(n_factors)
    criterion = 0
    for _ in range(self.q):
        previous = criterion
        rotated = np.dot(unrotated, rotation)
        # Diagonal matrix holding the squared column norms of the rotated
        # loadings.
        column_norms = np.diag(np.diag(np.dot(rotated.T, rotated)))
        target = np.dot(
            unrotated.T,
            np.asarray(rotated)**3
            - (self.gamma / n_rows) * np.dot(rotated, column_norms))
        left, singular_values, right = np.linalg.svd(target)
        rotation = np.dot(left, right)
        criterion = np.sum(singular_values)
        # The very first round has no previous value to compare against.
        if previous != 0 and criterion / previous < 1 + self.tol:
            break

    orthogonal = np.dot(unrotated, rotation)
    self.orthogonal = orthogonal

    table = format_data_col(
        pd.DataFrame(orthogonal, index=data.columns, columns=self.col))
    return transform_table_data_to_html({
        'title': "旋转后因子载荷",
        'col': table.columns.values.tolist(),
        'row': table.index.values.tolist(),
        'data': table.values.tolist(),
    })
def loadings(self, data):
    """Compute the unrotated factor loading matrix and render it as a table.

    The loadings are the first ``self.component`` rows of ``self.eigvector``
    (transposed) scaled by the square roots of the matching eigenvalues
    returned by ``self.var_contri(data)['Eigvalue']``.  For display, every
    column is negated except the second one (an existing sign convention of
    this method that is kept as is).

    Side effects: ``self.col`` receives the ``Factor<i>`` column names and
    ``self.zaihe`` the formatted loadings frame.

    :param data: table of observed variables; cast to float, its column
        labels become the index of the loadings frame.
    :return: the value produced by ``transform_table_data_to_html`` for the
        loadings table.
    """
    data = data.astype(float)
    factor_num = self.component

    # var_contri() is called before self.eigvector is read, exactly as in
    # the original statement order.
    eigvalue = self.var_contri(data)['Eigvalue']
    # Diagonal matrix of the square roots of the leading eigenvalues.
    root_eig = np.diag(
        np.array(np.sqrt(eigvalue[:factor_num]), dtype=float))
    raw_loadings = np.dot(self.eigvector[:factor_num].T, root_eig)

    col = ['Factor' + str(i) for i in range(1, factor_num + 1)]
    zaihe = -pd.DataFrame(raw_loadings, columns=col)
    # Restore the sign of the second factor.  With a single factor there is
    # no second column and the unguarded statement raised IndexError.
    if zaihe.shape[1] > 1:
        zaihe.iloc[:, 1] = -zaihe.iloc[:, 1]
    self.col = col
    zaihe.index = data.columns
    self.zaihe = format_data_col(zaihe)

    return transform_table_data_to_html({
        'title': "旋转前因子载荷",
        'col': self.zaihe.columns.values.tolist(),
        'row': self.zaihe.index.values.tolist(),
        'data': self.zaihe.values.tolist(),
    })
def score(self, data):
    """Compute the factor scores of every observation and render them.

    The scores are the product of the (optionally standardised) data with
    the transpose of ``self.coefficient``; the columns are named
    ``ScoreF1`` .. ``ScoreF<self.component>``.

    :param data: table of observed variables; cast to float.  It is passed
        through ``FA.standardization`` when ``self.standardize`` equals
        ``True`` and used as is when it equals ``False``.
    :return: the value produced by ``transform_table_data_to_html`` for the
        score table, or ``None`` when ``self.standardize`` equals neither
        ``True`` nor ``False``.
    """
    data = data.astype(float)

    # Equality comparisons (rather than truthiness) are kept on purpose so
    # that any other flag value still falls through and returns None.
    if self.standardize == True:
        source = FA.standardization(self, data)
    elif self.standardize == False:
        source = data
    else:
        return None

    scores = pd.DataFrame(np.dot(source, self.coefficient.T))
    scores.columns = [
        'ScoreF' + str(k) for k in range(1, self.component + 1)
    ]
    scores = format_data_col(scores)
    return transform_table_data_to_html({
        'title': "因子得分",
        'col': scores.columns.values.tolist(),
        'row': scores.index.values.tolist(),
        'data': scores.values.tolist(),
    })
def correlation_matrix(x):
    """Render the pairwise correlation matrix of ``x`` as a table.

    :param x: table whose values are cast to float and then correlated
        column by column with ``corr()`` (default settings).
    :return: the value produced by ``transform_table_data_to_html`` for the
        correlation table.
    """
    numeric = x.astype(float)
    corr_table = format_data_col(numeric.corr())

    payload = {'title': "相关性矩阵"}
    payload['col'] = corr_table.columns.values.tolist()
    payload['row'] = corr_table.index.values.tolist()
    payload['data'] = corr_table.values.tolist()
    return transform_table_data_to_html(payload)
def score_coef(self, data):
    """Compute the factor score coefficient matrix and render it.

    The coefficients follow the regression method: ``A' R^-1`` where ``A``
    is the rotated loading matrix returned by ``self.varimax_rota(data)``
    and ``R`` the correlation matrix of the observed variables.

    The previous implementation multiplied by ``R`` transposed instead of
    its inverse, which does not give regression score coefficients.  The
    pseudo-inverse is used so that a singular correlation matrix (e.g.
    perfectly collinear variables) does not raise.

    Side effect: ``self.coefficient`` receives the coefficient frame
    (factors as rows, variables as columns).

    :param data: table of observed variables; cast to float.
    :return: the value produced by ``transform_table_data_to_html`` for the
        transposed coefficient table (variables as rows).
    """
    data = data.astype(float)

    # Correlation matrix of the original variables (columns are variables).
    # atleast_2d covers the single-variable case where corrcoef is 0-d.
    corr = np.atleast_2d(np.corrcoef(data, rowvar=0))
    A = self.varimax_rota(data)

    weights = np.dot(np.asarray(A).T, np.linalg.pinv(corr))
    coefficient = pd.DataFrame(weights,
                               columns=data.columns,
                               index=self.col)
    self.coefficient = coefficient

    defen = format_data_col(coefficient.T)
    return transform_table_data_to_html({
        'title': "因子得分系数矩阵",
        'col': defen.columns.values.tolist(),
        'row': defen.index.values.tolist(),
        'data': defen.values.tolist(),
    })
def cross_chi2(index, columns):
    """Cross-tabulate two variables and run six chi-square type tests.

    ``chi2_contingency`` is evaluated on the table without margins once per
    ``lambda_`` statistic (always with ``correction=True``).

    :param index: values passed to ``pd.crosstab`` as ``index``.
    :param columns: values passed to ``pd.crosstab`` as ``columns``.
    :return: list of three rendered tables: the contingency table with
        totals, the expected frequencies, and the test results (statistic,
        p-value, degrees of freedom per method).
    """
    methods = [
        "pearson", "log-likelihood", "freeman-tukey", "mod-log-likelihood",
        "neyman", "cressie-read"
    ]

    cross_result = pd.crosstab(index=index, columns=columns, margins=True)
    # The tests must not see the summary row/column.
    cr_re = pd.crosstab(index=index, columns=columns, margins=False)

    chi_res = []
    for method in methods:
        statistic, p_value, dof, _ = chi2_contingency(cr_re,
                                                      correction=True,
                                                      lambda_=method)
        chi_res.append([
            "{:.4f}".format(statistic), "{:.4f}".format(p_value), dof
        ])

    # Relabel the margin row/column of the displayed table.
    corss_index = cross_result.index.tolist()
    corss_index[-1] = '总计'
    corss_columns = cross_result.columns.tolist()
    corss_columns[-1] = '总计'
    corss_value = cross_result.values.tolist()

    expected_table = sum_data(pd.DataFrame(expected_freq(cr_re)))
    expect = format_data_col(expected_table).values.tolist()

    r1 = transform_table_data_to_html({
        'title': "交叉表",
        'row': corss_index,
        'col': corss_columns[0:],
        'data': corss_value
    })
    r2 = transform_table_data_to_html({
        'title': "期望频数表",
        'row': corss_index,
        'col': corss_columns,
        'data': expect
    })
    r3 = transform_table_data_to_html({
        'title': "卡方检验",
        'row': list(methods),
        'col': ['值', '显著性', '自由度'],
        'data': chi_res
    })
    return [r1, r2, r3]
def PCA(x, components=None):
    """Run a principal component analysis on ``x`` and collect the report.

    Each column is standardised with its mean and sample standard deviation
    (``ddof=1``); the covariance matrix of the standardised data is then
    eigen-decomposed and the components with the largest eigenvalues kept.

    The covariance matrix is symmetric, so ``np.linalg.eigh`` is used.
    Unlike ``np.linalg.eig`` it always returns real eigenvalues and
    eigenvectors, which the square root, plotting and ``'.4f'`` formatting
    below rely on.  The sign of an eigenvector is arbitrary, so component
    signs may differ from those produced by ``eig``.

    :param x: DataFrame containing only the feature variables.
    :param components: number of principal components to keep; defaults to
        the number of columns of ``x``.
    :return: list of four entries, in order: a dict describing the
        explained-variance table, a dict carrying the scree plot as base64,
        the rendered component-loading table and the rendered score table.
    """
    x = x.astype(float)
    result = []
    if components is None:
        components = x.shape[1]

    # Standardise every column.
    average = np.mean(x, axis=0)
    sigma = np.std(x, axis=0, ddof=1)
    data_standardized = (x - average) / sigma

    # atleast_2d keeps single-column input usable (np.cov returns a 0-d
    # array in that case).
    cov_matrix = np.atleast_2d(np.cov(data_standardized.T))
    eigen_values, eigen_vectors = np.linalg.eigh(cov_matrix)
    order = np.argsort(-eigen_values)  # positions, largest eigenvalue first
    selected_vectors = eigen_vectors.T[order[:components]]
    scores = np.asarray(np.dot(data_standardized, selected_vectors.T))
    eigen_sorted = eigen_values[order]
    n_selected = len(selected_vectors)

    # Eigenvalues with their proportion and cumulative proportion.
    contribution = pd.DataFrame(eigen_sorted, columns=['EigenValue'])
    contribution['Proportion'] = (contribution['EigenValue'] /
                                  contribution['EigenValue'].sum())
    contribution['Cumulative'] = contribution['Proportion'].cumsum()

    # Scree plot and explained-variance plot.
    positions = range(1, len(contribution) + 1)
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(6.8, 3)
    fig.subplots_adjust(wspace=0.5)
    ax1.plot(positions, contribution['EigenValue'], 'o-')
    ax1.set_title('Scree Plot')
    ax1.set_xlabel('Principal Components')
    ax1.set_ylabel('Eigenvalue')
    ax1.grid()
    ax2.plot(positions, contribution['Proportion'], 'o-')
    ax2.plot(positions, contribution['Cumulative'], 'bo-.')
    ax2.set_title('Variance Explained')
    ax2.set_xlabel('Principal Components')
    ax2.set_ylabel('Proportion')
    ax2.grid()
    # NOTE(review): plt.show() runs before the figure is exported below; on
    # an interactive backend this may block or leave an empty figure --
    # confirm against the deployment backend.
    plt.show()

    # Selected eigenvectors, variables as rows.
    vector_index = ['prin%d' % (i + 1) for i in range(n_selected)]
    vector_columns = x.columns.values.tolist()
    principal_vector = pd.DataFrame(selected_vectors,
                                    index=vector_index,
                                    columns=vector_columns).T

    # Component loadings: eigenvector scaled by sqrt(eigenvalue).  This must
    # happen before the eigenvalues are formatted into strings below.
    principal_component_load = pd.DataFrame()
    for i in range(n_selected):
        principal_component_load['z%d' % (i + 1)] = (
            np.sqrt(eigen_sorted[i]) * principal_vector['prin%d' % (i + 1)])

    # Component scores plus an eigenvalue-weighted total, sorted descending.
    principal_scores = pd.DataFrame()
    for i in range(n_selected):
        principal_scores['prin%d_score' % (i + 1)] = scores[:, i]
    weighted = eigen_sorted[:n_selected] * principal_scores
    principal_scores['scores'] = weighted.sum(axis=1)
    principal_scores = principal_scores.sort_values(by='scores',
                                                    ascending=False)

    # Turn the explained-variance columns into display strings.
    contribution['EigenValue'] = contribution['EigenValue'].apply(
        lambda value: format(value, '.4f'))
    contribution['Proportion'] = contribution['Proportion'].apply(
        lambda value: format(value, '.2%'))
    contribution['Cumulative'] = contribution['Cumulative'].apply(
        lambda value: format(value, '.2%'))

    result.append({
        'title': "总方差解释",
        'col': ['特征值', '特征值方差贡献率', '累计方差贡献率'],
        'data': contribution.values.tolist()
    })
    result.append({
        "title": "碎石图",
        "base64": "{}".format(plot_and_output_base64_png(plt))
    })

    prin_com_load = format_data_col(principal_component_load)
    result.append(
        transform_table_data_to_html({
            'title': "主成分载荷",
            'col': prin_com_load.columns.values.tolist(),
            'row': prin_com_load.index.values.tolist(),
            'data': prin_com_load.values.tolist()
        }))

    prin_scores = format_data_col(principal_scores)
    result.append(
        transform_table_data_to_html({
            'title': "主成分得分系数矩阵",
            'col': prin_scores.columns.values.tolist(),
            'row': prin_scores.index.values.tolist(),
            'data': prin_scores.values.tolist()
        }))
    return result
def result_one_sample_chi():
    """
    One-sample chi-square (goodness-of-fit) endpoint.

    Request parameters: {
        "table_name": "",             # str, database table name
        "X": ["x1", "x2"],            # list, variables under test
        "E": ["e1", "e2"],            # list, expected-frequency variables
        "input_e": [2, 3, 4],         # expected frequencies typed in by the user
        "button_type": ["select", "input", "null"]  # kind of expectation;
                                                    # only element 0 is read
    }

    Every variable in ``X`` is tested on its own (one statistic and p-value
    per variable).  Depending on ``button_type[0]`` the expected frequencies
    are absent (``'null'``), read from the columns ``E`` (``'select'``) or
    taken from ``input_e`` (``'input'``).  A flat ``input_e`` list is read
    as one expected frequency per category and applied to every variable.
    Any other ``button_type[0]`` yields an empty result list.

    :return: ``jsonify`` response with the keys ``code``, ``msg``, ``res``.
    :raises TypeError: when ``X`` is not a list.
    """
    log.info('result_one_sample_chi_get_results_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        E = request_data['E']
        input_e = request_data['input_e']
        button_type = request_data['button_type']
    except Exception as e:
        log.info(e)
        raise e
    # The former ``assert isinstance([X], list)`` wrapped X in a list and
    # therefore could never fail.
    if not isinstance(X, list):
        raise TypeError("X must be a list of column names")

    results = []
    try:
        mode = button_type[0]
        if mode == 'null':
            da = exec_sql(table_name, X)
            da = da.astype(float)
            data = [da[i] for i in X]
            log.info("输入数据大小:{}".format(len(data)))
            if da.shape[1] >= 1:
                results = _one_sample_chi_table(data, None, X)
            log.info("无期望频率情况分析完成")
        elif mode == 'select':
            te = exec_sql(table_name, X)
            te = te.astype(float)
            test = [te[i] for i in X]
            ex = exec_sql(table_name, E)
            ex = ex.astype(float)
            expect = [ex[j] for j in E]
            log.info("输入数据大小:{}".format(len(test)))
            if te.shape[1] >= 1:
                results = _one_sample_chi_table(test, expect, X)
            log.info("有期望频率情况分析完成")
        elif mode == 'input':
            te = exec_sql(table_name, X)
            te = te.astype(float)
            test = [te[i] for i in X]
            # A flat list stays one-dimensional so that it lines up with
            # the categories of each variable; going through a DataFrame
            # turned it into a column vector that broadcast the wrong way.
            expect = np.asarray(input_e, dtype=float)
            log.info("输入数据大小:{}".format(len(test)))
            if te.shape[1] >= 1:
                results = _one_sample_chi_table(test, expect, X)
            log.info("用户输入的期望频率情况分析完成")
        response_data = {"code": "200", "msg": "ok!", "res": results}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        raise e


def _one_sample_chi_table(observed, expected, row):
    """Test each observed variable against ``expected`` and render a table.

    ``observed`` is a list with one sequence of frequencies per variable,
    so the test always runs along axis 1.  The single-variable case used to
    run along axis 0 over a one-row array, which produced one degenerate
    test per category instead of one test for the variable.
    """
    observed = np.asarray(observed, dtype=float)
    statistic, pvalue = stats.power_divergence(observed,
                                               f_exp=expected,
                                               axis=1)
    table = pd.DataFrame([statistic, pvalue]).T
    table = table.astype(float)
    table = format_data_col(table)
    return transform_table_data_to_html({
        'title': '单样本卡方检验',
        'col': ['卡方', '显著性'],
        'row': row,
        'data': table.values.tolist()
    })
def cross_chis(index, columns, fenceng):
    """Layered cross-tabulation with chi-square type tests per layer.

    A contingency table is built from ``index`` and ``columns``; for every
    distinct value of the first index variable (the layer) the matching
    sub-table is tested with ``chi2_contingency`` once per ``lambda_``
    statistic (always with ``correction=True``).

    :param index: sequence of index variables for ``pd.crosstab``; element
        0 is the layer variable.
    :param columns: values passed to ``pd.crosstab`` as ``columns``.
    :param fenceng: sequence whose first element is used as the layer name
        in the row labels of the test table.
    :return: list of three rendered tables: the contingency table with
        totals, the expected frequencies, and the test results (statistic,
        p-value, degrees of freedom per layer and method).
    """
    methods = [
        "pearson", "log-likelihood", "freeman-tukey", "mod-log-likelihood",
        "neyman", "cressie-read"
    ]

    # Displayed table, including the margin row/column.
    cross_result = pd.crosstab(index=index, columns=columns, margins=True)
    corss_index = cross_result.index.tolist()
    corss_index[-1] = '总计'
    corss_columns = cross_result.columns.tolist()
    corss_columns[-1] = '总计'
    corss_value = cross_result.values.tolist()

    # The tests must not see the summary row/column.
    cr_re = pd.crosstab(index=index, columns=columns, margins=False)
    first_index = np.unique(index[0])

    chi_res = []
    expect = []
    row = []
    for layer in first_index:
        layer_table = cr_re.loc[layer, :]
        outcomes = [
            chi2_contingency(layer_table, correction=True, lambda_=method)
            for method in methods
        ]
        for method, (statistic, p_value, dof, _) in zip(methods, outcomes):
            chi_res.append([
                "{:.4f}".format(statistic), "{:.4f}".format(p_value), dof
            ])
            # format() instead of ``+`` so that non-string layer values
            # (e.g. integers) no longer raise TypeError.
            row.append("{}_{}:{}".format(fenceng[0], layer, method))
        # Expected counts of the Pearson run, one entry per table row.
        expect.extend(outcomes[0][3])

    expect = pd.DataFrame(expect)
    expect = sum_data(expect)
    expect = format_data_col(expect).values.tolist()

    # Multi-level labels are joined with "/".  The relabelled total row is a
    # plain string and must not be split into characters ("总/计").
    row_labels = [
        "/".join(["{}".format(d) for d in c])
        if isinstance(c, tuple) else "{}".format(c) for c in corss_index
    ]

    r1 = transform_table_data_to_html({
        'title': "交叉表",
        'row': list(row_labels),
        'col': corss_columns,
        'data': corss_value
    })
    # NOTE(review): the first column label is dropped here while cross_chi2
    # passes the full list -- confirm against what sum_data returns.
    r2 = transform_table_data_to_html({
        'title': "期望频数表",
        'row': list(row_labels),
        'col': corss_columns[1:],
        'data': expect
    })
    r3 = transform_table_data_to_html({
        'title': "卡方检验",
        'row': row,
        'col': ['值', '显著性', '自由度'],
        'data': chi_res
    })
    return [r1, r2, r3]