Ejemplo n.º 1
0
 def varimax_rotation(self, data):
     """Apply a varimax (maximum-variance) orthogonal rotation to the loadings.

     The unrotated loadings are taken from ``self.load(data)``. The rotation
     matrix is refined for at most ``self.q`` iterations and the loop stops
     early once the ratio between two successive sums of singular values
     falls below ``1 + self.tol``. The rotated loadings are stored on
     ``self.orthogonal`` and returned as the table produced by
     ``transform_table_data_to_html``.
     """
     data = data.astype(float)
     loadings = self.load(data)
     n_rows, n_factors = loadings.shape
     rotation = np.eye(n_factors)
     criterion = 0
     for _ in range(self.q):
         previous = criterion
         rotated = np.dot(loadings, rotation)
         # Diagonal matrix holding the column sums of squares of the
         # currently rotated loadings.
         column_ss = np.diag(np.diag(np.dot(rotated.T, rotated)))
         target = np.asarray(rotated)**3 - (self.gamma / n_rows) * np.dot(
             rotated, column_ss)
         left, singular, right = np.linalg.svd(np.dot(loadings.T, target))
         rotation = np.dot(left, right)
         criterion = np.sum(singular)
         # The very first pass has no previous value to compare against.
         if previous != 0 and criterion / previous < 1 + self.tol:
             break
     rotated_loadings = np.dot(loadings, rotation)
     self.orthogonal = rotated_loadings
     table = format_data_col(
         pd.DataFrame(rotated_loadings, index=data.columns, columns=self.col))
     payload = {
         'title': "旋转后因子载荷",
         'col': table.columns.values.tolist(),
         'row': table.index.values.tolist(),
         'data': table.values.tolist()
     }
     return transform_table_data_to_html(payload)
Ejemplo n.º 2
0
 def loadings(self, data):
     """Build the unrotated factor loading table.

     The loadings are the first ``self.component`` rows of ``self.eigvector``
     (transposed) scaled by the square roots of the matching eigenvalues,
     which are read from ``self.var_contri(data)['Eigvalue']``.

     Side effects: sets ``self.col`` (the ``Factor1..FactorK`` labels) and
     ``self.zaihe`` (the loading table after ``format_data_col``).

     Returns the table rendered by ``transform_table_data_to_html``.
     """
     data = data.astype(float)
     factor_num = self.component
     # var_contri is called before self.eigvector is read, as in the
     # original statement order.
     eigvalue = self.var_contri(data)['Eigvalue']
     # Diagonal matrix of sqrt(eigenvalue) for the retained factors.
     sqrt_eig = list(np.array(np.sqrt(eigvalue[:factor_num]), dtype=float))
     raw_loadings = np.dot(self.eigvector[:factor_num].T, np.diag(sqrt_eig))

     col = ['Factor{}'.format(i) for i in range(1, factor_num + 1)]
     # Sign convention: negate every column, then flip the second column back.
     # NOTE(review): eigenvector signs are arbitrary; presumably this matches
     # a reference output - confirm.
     zaihe = -pd.DataFrame(raw_loadings, columns=col)
     if zaihe.shape[1] > 1:
         # Guarded: with a single factor there is no second column and the
         # unguarded iloc access raised IndexError.
         zaihe.iloc[:, 1] = -zaihe.iloc[:, 1]
     self.col = col
     zaihe.index = data.columns
     self.zaihe = format_data_col(zaihe)
     return transform_table_data_to_html({
         'title': "旋转前因子载荷",
         'col': self.zaihe.columns.values.tolist(),
         'row': self.zaihe.index.values.tolist(),
         'data': self.zaihe.values.tolist()
     })
Ejemplo n.º 3
0
 def score(self, data):
     """Compute the factor scores and return them as a rendered table.

     When ``self.standardize`` equals ``True`` the data is first passed
     through ``FA.standardization``; when it equals ``False`` the data is
     used as is. Any other value yields ``None``, which is what the original
     two-branch implementation did implicitly.

     The scores are ``data_scale . self.coefficient.T`` with columns named
     ``ScoreF1..ScoreFK`` where ``K = self.component``.
     """
     data = data.astype(float)
     # Equality (not truthiness) is kept on purpose so that values other than
     # True/False keep falling through to the None result.
     if self.standardize == True:  # noqa: E712
         data_scale = FA.standardization(self, data)
     elif self.standardize == False:  # noqa: E712
         data_scale = data
     else:
         return None

     F = pd.DataFrame(np.dot(data_scale, self.coefficient.T))
     F.columns = ['ScoreF{}'.format(i) for i in range(1, self.component + 1)]
     F = format_data_col(F)
     return transform_table_data_to_html({
         'title': "因子得分",
         'col': F.columns.values.tolist(),
         'row': F.index.values.tolist(),
         'data': F.values.tolist()
     })
def correlation_matrix(x):
    """Render the pairwise correlation matrix of ``x`` as a table.

    ``x`` is cast to float, ``DataFrame.corr()`` is computed, the result is
    passed through ``format_data_col`` and handed to
    ``transform_table_data_to_html``.
    """
    corr_table = format_data_col(x.astype(float).corr())
    payload = {
        'title': "相关性矩阵",
        'col': corr_table.columns.values.tolist(),
        'row': corr_table.index.values.tolist(),
        'data': corr_table.values.tolist()
    }
    return transform_table_data_to_html(payload)
Ejemplo n.º 5
0
 def score_coef(self, data):
     """Compute the factor score coefficient matrix.

     The coefficients are ``A.T . corr.T`` where ``corr`` is the correlation
     matrix of the columns of ``data`` and ``A`` is whatever
     ``self.varimax_rota(data)`` returns. The result (factors x variables)
     is stored on ``self.coefficient``; its transpose is formatted and
     returned as the table produced by ``transform_table_data_to_html``.
     """
     data = data.astype(float)
     # Correlation matrix of the original variables (columns are variables).
     corr = np.corrcoef(data, rowvar=0)
     A = self.varimax_rota(data)
     # np.mat was removed in NumPy 2.0; a plain ndarray transpose yields the
     # same product.
     # NOTE(review): regression-method score coefficients are usually
     # A' R^-1; this multiplies by the transpose of R instead - confirm
     # that the inverse was not intended.
     coefficient = pd.DataFrame(np.dot(np.array(A).T,
                                       np.asarray(corr).T),
                                columns=data.columns,
                                index=self.col)
     self.coefficient = coefficient
     defen = format_data_col(coefficient.T)
     return transform_table_data_to_html({
         'title': "因子得分系数矩阵",
         'col': defen.columns.values.tolist(),
         'row': defen.index.values.tolist(),
         'data': defen.values.tolist()
     })
def cross_chi2(index, columns):
    """Cross-tabulate two variables and run six chi-square style tests.

    Returns a list of three rendered tables, in this order: the crosstab
    with totals, the expected-frequency table, and the test results (one row
    per ``lambda_`` method with statistic, p-value and degrees of freedom).
    """
    methods = [
        "pearson", "log-likelihood", "freeman-tukey", "mod-log-likelihood",
        "neyman", "cressie-read"
    ]
    with_totals = pd.crosstab(index=index, columns=columns, margins=True)
    # The tests must not see the margin row/column.
    observed = pd.crosstab(index=index, columns=columns, margins=False)

    chi_res = []
    for method in methods:
        statistic, p_value, dof, _ = chi2_contingency(observed,
                                                      correction=True,
                                                      lambda_=method)
        chi_res.append(
            ["{:.4f}".format(statistic), "{:.4f}".format(p_value), dof])

    # Relabel the margin row/column added by crosstab.
    row_labels = with_totals.index.tolist()
    row_labels[-1] = '总计'
    col_labels = with_totals.columns.tolist()
    col_labels[-1] = '总计'

    counts = with_totals.values.tolist()
    expected_table = sum_data(pd.DataFrame(expected_freq(observed)))
    expect = format_data_col(expected_table).values.tolist()

    r1 = transform_table_data_to_html({
        'title': "交叉表",
        'row': row_labels,
        'col': col_labels[0:],
        'data': counts
    })
    r2 = transform_table_data_to_html({
        'title': "期望频数表",
        'row': row_labels,
        'col': col_labels,
        'data': expect
    })
    r3 = transform_table_data_to_html({
        'title': "卡方检验",
        'row': list(methods),
        'col': ['值', '显著性', '自由度'],
        'data': chi_res
    })
    return [r1, r2, r3]
def PCA(x,
        components=None
        ):
    """Run a principal component analysis on ``x`` and assemble the report.

    Args:
        x: DataFrame containing only the feature variables; it is cast to
            float before use.
        components: number of principal components to keep. When ``None``
            the number of columns of ``x`` is used.

    Returns:
        A list with four entries, in order: a dict describing the explained
        variance table, a dict carrying the scree plot under ``"base64"``,
        the loadings table and the scores table (the last two rendered by
        ``transform_table_data_to_html``).
    """
    x = x.astype(float)
    result = []
    if components is None:
        # Default: keep as many components as there are columns.
        components = x.shape[1]

    # Standardise: subtract the column means, divide by the sample standard
    # deviation (ddof=1).
    average = np.mean(x, axis=0)
    sigma = np.std(x, axis=0, ddof=1)
    r, _ = np.shape(x)
    # Repeat the mean vector once per row so it lines up with x.
    mu = np.tile(average, (r, 1))
    data_standardized = (x - mu) / sigma

    cov_matrix = np.cov(data_standardized.T)  # covariance matrix
    EigenValue, EigenVector = np.linalg.eig(cov_matrix)

    # Positions of the eigenvalues sorted from largest to smallest.
    index = np.argsort(-EigenValue)
    # Eigenvectors (as rows) belonging to the requested leading eigenvalues.
    selected_Vector = EigenVector.T[index[:components]]
    Score = np.dot(data_standardized, selected_Vector.T)  # component scores
    EigenValue_sorted = EigenValue[index]
    n_selected = len(selected_Vector)

    # Eigenvalues with their share of variance and the cumulative share.
    EigenValue_contribution = pd.DataFrame(EigenValue_sorted,
                                           columns=['EigenValue'])
    EigenValue_contribution['Proportion'] = EigenValue_contribution[
        'EigenValue'] / EigenValue_contribution['EigenValue'].sum()
    EigenValue_contribution['Cumulative'] = EigenValue_contribution[
        'Proportion'].cumsum()

    # Scree plot (left) and explained-variance plot (right).
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(6.8, 3)
    fig.subplots_adjust(wspace=0.5)
    positions = range(1, len(EigenValue_contribution) + 1)

    ax1.plot(positions, EigenValue_contribution['EigenValue'], 'o-')
    ax1.set_title('Scree Plot')
    ax1.set_xlabel('Principal Components')
    ax1.set_ylabel('Eigenvalue')
    ax1.grid()

    ax2.plot(positions, EigenValue_contribution['Proportion'], 'o-')
    ax2.plot(positions, EigenValue_contribution['Cumulative'], 'bo-.')
    ax2.set_title('Variance Explained')
    ax2.set_xlabel('Principal Components')
    ax2.set_ylabel('Proportion')
    ax2.grid()
    # NOTE(review): show() runs before the figure is exported further down;
    # with an interactive backend this may block or leave nothing to export -
    # confirm which backend is in use.
    plt.show()

    # Selected eigenvectors, one column per component, rows are variables.
    vector_index = ['prin%d' % (i + 1) for i in range(n_selected)]
    vector_columns = x.columns.values.tolist()
    principal_vector = pd.DataFrame(selected_Vector,
                                    index=vector_index,
                                    columns=vector_columns).T

    # Component loadings: sqrt(eigenvalue) * eigenvector.
    principal_component_load = pd.DataFrame()
    for i in range(n_selected):
        principal_component_load['z%d' % (i + 1)] = np.sqrt(
            EigenValue_contribution['EigenValue'][i]) * principal_vector[
                'prin%d' % (i + 1)]

    # Component scores plus an eigenvalue-weighted total used for ranking.
    principal_scores = pd.DataFrame()
    for i in range(n_selected):
        principal_scores['prin%d_score' % (i + 1)] = Score[:, i]
    EigenValue_sorted_selected = EigenValue_sorted[:n_selected]
    chengji = EigenValue_sorted_selected * principal_scores
    principal_scores['scores'] = chengji.sum(axis=1)
    principal_scores = principal_scores.sort_values(by='scores',
                                                    ascending=False)

    # Format the variance table in place; it is no longer needed as numbers.
    Eig_contri = EigenValue_contribution
    Eig_contri['EigenValue'] = Eig_contri['EigenValue'].apply(
        lambda v: format(v, '.4f'))
    Eig_contri['Proportion'] = Eig_contri['Proportion'].apply(
        lambda v: format(v, '.2%'))
    Eig_contri['Cumulative'] = Eig_contri['Cumulative'].apply(
        lambda v: format(v, '.2%'))
    result.append({
        'title': "总方差解释",
        'col': ['特征值', '特征值方差贡献率', '累计方差贡献率'],
        'data': Eig_contri.values.tolist()
    })
    result.append({
        "title": "碎石图",
        "base64": "{}".format(plot_and_output_base64_png(plt))
    })

    prin_com_load = format_data_col(principal_component_load)
    result.append(
        transform_table_data_to_html({
            'title': "主成分载荷",
            'col': prin_com_load.columns.values.tolist(),
            'row': prin_com_load.index.values.tolist(),
            'data': prin_com_load.values.tolist()
        }))

    prin_scores = format_data_col(principal_scores)
    result.append(
        transform_table_data_to_html({
            'title': "主成分得分系数矩阵",
            'col': prin_scores.columns.values.tolist(),
            'row': prin_scores.index.values.tolist(),
            'data': prin_scores.values.tolist()
        }))

    return result
Ejemplo n.º 8
0
def result_one_sample_chi():
    """
    Handle a one-sample chi-square test request.

    Request parameters (read from ``init_route()``): {
        "table_name": ""  # str, database table name
        "X": ["x1", "x2"],  # list, variables under test
        "E": ["e1", "e2"],  # list, expected-frequency variables
        "input_e": [2, 3, 4],  # expected frequencies typed in by the user
        "button_type": ["select", "input", "null"]  # button type; only
                                                    # element 0 is inspected
    }
    :return: ``jsonify`` response with ``code``, ``msg`` and ``res``; ``res``
        is the rendered result table, or an empty list when no branch
        produced one.
    :raises TypeError: if ``X`` is not a list.
    """
    log.info('result_one_sample_chi_get_results_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        E = request_data['E']
        input_e = request_data['input_e']
        button_type = request_data['button_type']
    except Exception as e:
        log.info(e)
        raise
    # The previous check wrapped X in a list before testing it, so it could
    # never fail; validate X itself.
    if not isinstance(X, list):
        raise TypeError("X must be a list of column names")

    results = []
    try:
        mode = button_type[0]
        if mode == 'null':
            # No expected frequencies supplied.
            da = exec_sql(table_name, X)
            da = da.astype(float)
            data = [da[i] for i in X]
            log.info("输入数据大小:{}".format(len(data)))
            if da.shape[1] == 1:
                statistic, pvalue = stats.power_divergence(da[X[0]], axis=0)
                results = _one_sample_chi_table(statistic, pvalue, X)
            elif da.shape[1] > 1:
                statistic, pvalue = stats.power_divergence(data, axis=1)
                results = _one_sample_chi_table(statistic, pvalue, X)
            log.info("无期望频率情况分析完成")

        elif mode == 'select':
            # Expected frequencies come from columns E of the same table.
            te = exec_sql(table_name, X)
            te = te.astype(float)
            test = [te[i] for i in X]
            ex = exec_sql(table_name, E)
            ex = ex.astype(float)
            expect = [ex[j] for j in E]
            log.info("输入数据大小:{}".format(len(test)))
            results = _one_sample_chi_with_expected(test, expect,
                                                    te.shape[1], X)
            log.info("有期望频率情况分析完成")

        elif mode == 'input':
            # Expected frequencies were typed in by the user.
            te = exec_sql(table_name, X)
            te = te.astype(float)
            test = [te[i] for i in X]
            expect = pd.DataFrame(input_e).astype(float).values.tolist()
            log.info("输入数据大小:{}".format(len(test)))
            results = _one_sample_chi_with_expected(test, expect,
                                                    te.shape[1], X)
            log.info("用户输入的期望频率情况分析完成")

        response_data = {
            "code": "200",
            "msg": "ok!",
            "res": results
        }
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        raise


def _one_sample_chi_table(statistic, pvalue, row):
    """Render statistic/p-value pairs as the one-sample chi-square table."""
    d = pd.DataFrame([statistic, pvalue]).T
    d = d.astype(float)
    d = format_data_col(d)
    return transform_table_data_to_html({
        'title': '单样本卡方检验',
        'col': ['卡方', '显著性'],
        'row': row,
        'data': d.values.tolist()
    })


def _one_sample_chi_with_expected(test, expect, n_columns, row):
    """Run power_divergence against expected frequencies and render it.

    Returns an empty list when ``n_columns`` is neither 1 nor greater than 1,
    matching the handler's initial ``results`` value.
    """
    if n_columns == 1:
        # NOTE(review): with one test column `test` is a one-element list, so
        # axis=0 reduces over that length-1 axis - confirm this is intended.
        axis = 0
    elif n_columns > 1:
        axis = 1
    else:
        return []
    statistic, pvalue = stats.power_divergence(test, expect, axis=axis)
    return _one_sample_chi_table(statistic, pvalue, row)
Ejemplo n.º 9
0
def cross_chis(index, columns, fenceng):
    """Layered crosstab with six chi-square style tests per layer.

    Args:
        index: sequence of row variables passed to ``pd.crosstab``; the
            unique values of ``index[0]`` define the layers.
        columns: column variable(s) passed to ``pd.crosstab``.
        fenceng: sequence whose first element is used as the prefix of the
            test-result row labels.

    Returns:
        A list of three rendered tables, in order: the crosstab with totals,
        the expected-frequency table, and the test results (six rows per
        layer, labelled ``<fenceng[0]>_<layer>:<method>``).
    """
    methods = [
        "pearson", "log-likelihood", "freeman-tukey", "mod-log-likelihood",
        "neyman", "cressie-read"
    ]

    # Layered crosstab including the margin row/column.
    cross_result = pd.crosstab(index=index, columns=columns, margins=True)
    corss_index = cross_result.index.tolist()
    corss_index[-1] = '总计'
    corss_columns = cross_result.columns.tolist()
    corss_columns[-1] = '总计'
    corss_value = cross_result.values.tolist()

    # The tests must not see the margin row/column.
    cr_re = pd.crosstab(index=index, columns=columns, margins=False)
    first_index = np.unique(index[0])

    chi_res = []
    expect = []
    row = []
    for level in first_index:
        layer_table = cr_re.loc[level, :]
        for method in methods:
            statistic, p_value, dof, expected = chi2_contingency(
                layer_table, correction=True, lambda_=method)
            chi_res.append(
                ["{:.4f}".format(statistic), "{:.4f}".format(p_value), dof])
            # format() instead of '+' so non-string layer values (e.g.
            # integers) no longer raise TypeError.
            row.append("{}_{}:{}".format(fenceng[0], level, method))
            if method == "pearson":
                # Expected frequencies are collected from the pearson run,
                # one row at a time.
                expect.extend(expected)

    expect = pd.DataFrame(expect)
    expect = sum_data(expect)
    expect = format_data_col(expect).values.tolist()

    # Index entries are joined with '/'; the totals label is a plain string
    # and must be kept whole rather than split per character.
    row_labels = [
        c if isinstance(c, str) else "/".join("{}".format(d) for d in c)
        for c in corss_index
    ]

    r1 = transform_table_data_to_html({
        'title': "交叉表",
        'row': list(row_labels),
        'col': corss_columns,
        'data': corss_value
    })
    # NOTE(review): the first column label is dropped here while cross_chi2
    # passes the full list - confirm this matches the shape of `expect`.
    r2 = transform_table_data_to_html({
        'title': "期望频数表",
        'row': list(row_labels),
        'col': corss_columns[1:],
        'data': expect
    })
    r3 = transform_table_data_to_html({
        'title': "卡方检验",
        'row': row,
        'col': ['值', '显著性', '自由度'],
        'data': chi_res
    })
    return [r1, r2, r3]