Example #1
def doctovec_similar_sentence(table, model, text_col, label_col):

    df = table.copy()
    result_sim = {}

    for i in range(10):
        temp = {}
        temp['sentence'] = []
        temp['label'] = []
        for doc_id, similarity in model.docvecs.most_similar(i):
            temp['sentence'].append(df.at[doc_id, text_col])
            temp['label'].append(df.at[doc_id, label_col])
        result_sim[i] = pd.DataFrame(temp)

    str_MD = '## Most similar sentences \n'

    for i in range(10):
        str_MD += '|' + df.at[i, 'document'] + '\n'
        str_MD += '|' + pandasDF2MD(result_sim[i]) + '\n'
    rb = ReportBuilder()
    rb.addMD(strip_margin(str_MD))

    _model = _model_dict('doc2vec')
    _model['report'] = rb.get()

    return {'model': _model}
Example #2
def bartletts_test(table, response_cols, factor_col):
    groups = table[factor_col].unique()
    
    data_list = []
    stat_list = []
    p_list = []
    for response_col in response_cols:
        response = table[response_col]
        stat_bart, p_bart = bartlett(*[response[table[factor_col] == group] for group in groups])
        data = '{response_col} by {factor_col}'.format(response_col=response_col, factor_col=factor_col)
        data_list.append(data)
        stat_list.append(stat_bart)
        p_list.append(p_bart)
        
    result_table = pd.DataFrame.from_items([ 
        ['data', data_list],
        ['estimate', stat_list],
        ['p_value', p_list] 
    ])
    
    result = dict()
    result['result_table'] = result_table
    
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    ## Bartlett's Test Result
    | - H0: k population variances are equal.
    | - H1: at least two variances are different.
    |
    | {result_table}
    """.format(result_table=pandasDF2MD(result_table))))
    
    result['report'] = rb.get()
        
    return {'result': result}
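A minimal usage sketch of the core step inside bartletts_test, assuming scipy is available; the DataFrame below is made up for illustration.

import pandas as pd
from scipy.stats import bartlett

# toy data: one numeric response grouped by a factor column
df = pd.DataFrame({'group': ['a', 'a', 'a', 'b', 'b', 'b'],
                   'value': [1.1, 0.9, 1.3, 2.0, 2.4, 1.8]})

# the call wrapped by bartletts_test: one sample per factor level
samples = [df.loc[df['group'] == g, 'value'] for g in df['group'].unique()]
stat_bart, p_bart = bartlett(*samples)
print(stat_bart, p_bart)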
Example #3
def _oneway_anova(table, response_cols, factor_col):
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    ## One-way Analysis of Variance Result
    """))
    groups = table[factor_col].unique()
    groups.sort()
    sum_len = np.sum([ len(str(group)) for group in groups ])
    
    result = dict()
    result['_grouped_data'] = dict()
    
    for response_col in response_cols:
        data = table[response_col]
        result['_grouped_data'][response_col] = dict()
        
        ax = sns.boxplot(x=factor_col, y=response_col, data=table, order=groups)
        if sum_len > 512:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        elif sum_len > 64:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
            
        fig_box = plt2MD(plt)
        plt.clf()
        
        model = ols("""Q('{response_col}') ~ C(Q('{factor_col}'))""".format(response_col=response_col, factor_col=factor_col), table).fit()  # TODO factor_col = class => error
        anova = anova_lm(model)
        
        anova_df = pandasDF2MD(anova)
        
        p_value = anova["""PR(>F)"""][0]
        
        residual = model.resid
        
        sns.distplot(residual)
        distplot = plt2MD(plt)
        plt.clf()
        
        sm.qqplot(residual, line='s')
        qqplot = plt2MD(plt)
        plt.clf()
            
        rb.addMD(strip_margin("""
        | ## {response_col} by {factor_col}
        | {fig_box}
        |
        | ### ANOVA
        | {anova_df}
        | 
        | ### Diagnostics
        | {distplot}
        |
        | {qqplot}
        """.format(response_col=response_col, factor_col=factor_col, fig_box=fig_box, anova_df=anova_df, distplot=distplot, qqplot=qqplot)))
        
        result['_grouped_data'][response_col]['p_value'] = p_value
        
    result['report'] = rb.get()
    return {'result': result}
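The ANOVA step inside the loop can be exercised on its own; a minimal sketch with made-up data, assuming statsmodels' formula API (the Q()/C() quoting matches the call above).

import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

df = pd.DataFrame({'growth': [4.1, 3.9, 5.2, 5.0, 6.3, 6.1],
                   'treatment': ['a', 'a', 'b', 'b', 'c', 'c']})

model = ols("Q('growth') ~ C(Q('treatment'))", df).fit()
anova = anova_lm(model)        # DataFrame with F and PR(>F) columns
p_value = anova['PR(>F)'][0]   # same lookup performed above
print(p_value)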
Example #4
def _evaluate_regression(table, label_col, prediction_col):
    label = table[label_col]
    predict = table[prediction_col]

    # compute metrics
    evs = explained_variance_score(label, predict)
    mae = mean_absolute_error(label, predict)
    mse = mean_squared_error(label, predict)
    mdae = median_absolute_error(label, predict)
    r2 = r2_score(label, predict)

    # json
    summary = dict()
    summary['label_col'] = label_col
    summary['prediction_col'] = prediction_col
    summary['r2_score'] = r2
    summary['mean_squared_error'] = mse
    summary['mean_absolute_error'] = mae
    summary['median_absolute_error'] = mdae
    summary['explained_variance_score'] = evs

    # report
    all_dict_list = [{
        'r2_score': r2,
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
        'median_absolute_error': mdae,
        'explained_variance_score': evs
    }]
    all_df = pd.DataFrame(all_dict_list)
    all_df = all_df[[
        'r2_score', 'mean_squared_error', 'mean_absolute_error',
        'median_absolute_error', 'explained_variance_score'
    ]]
    summary['metrics'] = all_df

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Evaluate Regression Result
    | ### Metrics
    | {table1}
    |
    |
    """.format(table1=pandasDF2MD(all_df))))
    summary['report'] = rb.get()

    return {'result': summary}
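A short sketch of the scikit-learn metric calls used above, on toy label/prediction vectors (the numbers are illustrative only).

from sklearn.metrics import (explained_variance_score, mean_absolute_error,
                             mean_squared_error, median_absolute_error, r2_score)

label = [3.0, -0.5, 2.0, 7.0]
predict = [2.5, 0.0, 2.0, 8.0]

print(r2_score(label, predict),
      mean_squared_error(label, predict),
      mean_absolute_error(label, predict),
      median_absolute_error(label, predict),
      explained_variance_score(label, predict))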
Example #5
def _hierarchical_clustering_post(table, model, num_clusters, cluster_col='prediction'):
    Z = model['model']
    input_cols = model['input_cols']
    params = model['parameters']
    out_table = model['outtable']
    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    out_table2 = table.copy()
    out_table2[cluster_col] = predict
    
    # leaders() gives the root node of each flat cluster (L) and the matching flat-cluster id (M)
    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])
    
    out_table3 = pd.DataFrame([])
    out_table3[cluster_col] = M
    out_table3['name_of_clusters'] = which_cluster
    out_table3 = out_table3.sort_values(cluster_col)
    cluster_count = np.bincount(out_table2[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    # data = {'cluster_name': ['prediction' + str(i) for i in range(1, num_clusters + 1)]}
    out_table3['num_of_entities'] = list(cluster_count)
    
    rb = ReportBuilder()
    rb.addMD(strip_margin("""### Hierarchical Clustering Post Process Result"""))
    rb.addMD(strip_margin("""
    |### Parameters
    |
    |{display_params}
    |
    |## Clusters Information
    |
    |{out_table3}
    |
    """.format(display_params=dict2MD(params), out_table3=pandasDF2MD(out_table3))))

    model = _model_dict('hierarchical_clustering_post')
    model['report'] = rb.get()
    
    return {'out_table2' : out_table2, 'model': model}
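The scipy pieces this post-processing step builds on can be sketched in isolation; the random data below is only for illustration.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, leaders

X = np.random.rand(10, 3)
Z = linkage(X, method='complete', metric='euclidean')

labels = fcluster(Z, t=3, criterion='maxclust')  # flat cluster id per observation
L, M = leaders(Z, labels)                        # root node of each flat cluster and its id
print(labels, L, M)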
Example #6
def tfidf_train(table,
                tokens_col,
                tf_weighing='n',
                df_weighing='t',
                document_normalization='c'):

    out_table = table.copy()
    _corpus = out_table[tokens_col]
    _smartirs = tf_weighing + df_weighing + document_normalization

    _dictionary = Dictionary(_corpus)
    _corpus = [_dictionary.doc2bow(text) for text in _corpus]

    _model = TfidfModel(_corpus, smartirs=_smartirs)
    _corpus = [text for text in _model[_corpus]]

    _sparse_matrix = corpus2csc(_corpus, num_terms=len(_dictionary.token2id)).T

    _values = [value for value in _dictionary.values()]
    _keys = [key for key in _dictionary.keys()]
    _dic = pd.DataFrame({'indice': _keys, 'word': _values})
    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Dictionary
    | {table1}
    """.format(table1=pandasDF2MD(_dic))))

    out_table['sparse_vectors'] = sparse_encode(
        _sparse_matrix)['sparse_vectors']

    fit_model = dict()
    fit_model['dictionary'] = _dictionary
    fit_model['model'] = _model
    fit_model['report'] = rb.get()
    return {'out_table': out_table, 'fit_model': fit_model}
Example #7
def _pca(table,
         input_cols,
         new_column_name='projected_',
         n_components=None,
         copy=True,
         whiten=False,
         svd_solver='auto',
         tol=0.0,
         iterated_power='auto',
         random_state=None,
         hue=None,
         alpha=0,
         key_col=None):

    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    pca = PCA(None, copy, whiten, svd_solver, tol, iterated_power,
              random_state)
    pca_model = pca.fit(table[input_cols])

    column_names = []
    for i in range(0, n_components):
        column_names.append(new_column_name + str(i))
    # print(column_names)

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result[:, :n_components],
                          columns=[column_names])

    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components[:n_components],
                                     columns=[input_cols])
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if n_components == 1:
        sns.scatterplot(column_names[0], column_names[0], hue=hue, data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(
            0,
            1,
            pc_columns=column_names,
            columns=input_cols,
            singular_values=res_singular_values,
            components=res_components,
            explained_variance_ratio=res_explained_variance_ratio,
            alpha=alpha,
            hue=hue,
            data=out_df,
            ax=plt.gca(),
            key_col=key_col)

    plt.figure()
    fig_scree = _screeplot(res_explained_variance,
                           res_explained_variance_ratio, n_components)

    table_explained_variance = pd.DataFrame(res_explained_variance,
                                            columns=['explained_variance'])
    table_explained_variance[
        'explained_variance_ratio'] = res_explained_variance_ratio
    table_explained_variance[
        'cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum(
        )

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## PCA Result
    | ### Plot
    | {image1}
    |
    | ### Explained Variance
    | {fig_scree}
    | {table_explained_variance}    
    |
    | ### Components
    | {table2}
    |
    | ### Parameters
    | {parameter1}
    """.format(image1=plt_two,
               fig_scree=fig_scree,
               table_explained_variance=pandasDF2MD(table_explained_variance),
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df))))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['report'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    return {'out_table': out_df, 'model': model}
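A stripped-down sketch of the scikit-learn calls behind _pca (random data; the column names are made up).

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

table = pd.DataFrame(np.random.rand(50, 4), columns=['c0', 'c1', 'c2', 'c3'])
input_cols = ['c0', 'c1', 'c2', 'c3']

pca_model = PCA(n_components=2).fit(table[input_cols])
projected = pca_model.transform(table[input_cols])
print(pca_model.explained_variance_ratio_, projected.shape)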
Example #8
def _logistic_regression_train(table,
                               feature_cols,
                               label_col,
                               penalty='l2',
                               dual=False,
                               tol=0.0001,
                               C=1.0,
                               fit_intercept=True,
                               intercept_scaling=1,
                               class_weight=None,
                               random_state=None,
                               solver='liblinear',
                               max_iter=100,
                               multi_class='ovr',
                               verbose=0,
                               warm_start=False,
                               n_jobs=1):
    features = table[feature_cols]
    label = table[label_col]
    lr_model = LogisticRegression(penalty, dual, tol, C, fit_intercept,
                                  intercept_scaling, class_weight,
                                  random_state, solver, max_iter, multi_class,
                                  verbose, warm_start, n_jobs)
    lr_model.fit(features, label)

    featureNames = np.append("Intercept", feature_cols)
    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2

    if (fit_intercept == True):
        summary = pd.DataFrame({'features': ['intercept'] + feature_cols})
        print(intercept)
        print(coefficients)

        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)),
                                    axis=0)
        if not is_binary:
            summary = pd.concat(
                (summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
        elif is_binary:
            summary = pd.concat(
                (summary, pd.DataFrame(coef_trans, columns=[classes[0]])),
                axis=1)

    else:
        summary = pd.DataFrame({'features': feature_cols})
        coef_trans = np.transpose(coefficients)

        if not is_binary:
            summary = pd.concat(
                (summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
        elif is_binary:
            summary = pd.concat(
                (summary, pd.DataFrame(coef_trans, columns=[classes[0]])),
                axis=1)

    prob = lr_model.predict_proba(features)

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    """.format(table1=pandasDF2MD(summary))))

    model = dict()
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['report'] = rb.get()

    return {'model': model}
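A minimal sketch of the scikit-learn fit the trainer wraps, with the keyword arguments spelled out and a made-up table.

import pandas as pd
from sklearn.linear_model import LogisticRegression

table = pd.DataFrame({'x1': [0.1, 0.4, 0.8, 0.9],
                      'x2': [1.0, 0.7, 0.2, 0.1],
                      'label': ['a', 'a', 'b', 'b']})

lr_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear', max_iter=100)
lr_model.fit(table[['x1', 'x2']], table['label'])
print(lr_model.intercept_, lr_model.coef_, lr_model.classes_)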
Example #9
def _pca(table,
         input_cols,
         new_column_name='projected_',
         n_components=None,
         copy=True,
         whiten=False,
         svd_solver='auto',
         tol=0.0,
         iterated_power='auto',
         random_state=None):

    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    pca = PCA(n_components, copy, whiten, svd_solver, tol, iterated_power,
              random_state)
    pca_model = pca.fit(table[input_cols])

    column_names = []
    for i in range(0, n_components):
        column_names.append(new_column_name + str(i))
    # print(column_names)

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result, columns=[column_names])

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components, columns=[input_cols])
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if res_n_components == 1:
        plt.scatter(pca_result[:, 0], pca_result[:, 0])
    else:
        plt.scatter(pca_result[:, 0], pca_result[:, 1])
    # plt.title('PCA result with two components')
    # plt.show()
    plt_two = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | 
    | ### Plot
    | The x-axis and y-axis of the following plot are projected0 and projected1, respectively.
    | {image1}
    |
    | ### Result
    | {table1}
    | only showing top 20 rows
    |
    | ### Parameters
    | {parameter1}
    |
    | ### Components
    | {table2}
    | 
    | ### Mean
    | {array1}
    | 
    | ### Explained Variance 
    | {array2}
    |
    """.format(table1=pandasDF2MD(out_df, 20),
               image1=plt_two,
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df),
               array1=res_mean,
               array2=res_explained_variance)))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['report'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    return {'out_table': out_df, 'model': model}
Example #10
def one_sample_ttest(table,
                     input_cols,
                     alternatives,
                     hypothesized_mean=0,
                     conf_level=0.95):

    n = len(table)
    degree = n - 1
    alpha = 1.0 - conf_level
    out_table = pd.DataFrame()

    # statistics
    statistics = "t statistic, t distribution with %d degrees of freedom under the null hypothesis." % degree

    # Print model
    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    ## One Sample T Test Result
    | - Statistics = {s}
    | - Hypothesized mean = {h} 
    | - Confidence level = {cl}
    """.format(s=statistics, h=hypothesized_mean, cl=conf_level)))

    for input_col in input_cols:
        # model
        alter_list = []
        p_list = []
        CI_list = []

        # data
        data = input_col

        # estimates
        result = stats.ttest_1samp(table[input_col], hypothesized_mean)
        estimates = result[0]

        cols = [
            'data', 'alternative_hypothesis', 'statistics', 'estimates',
            'p_value', 'confidence_level', 'lower_confidence_interval',
            'upper_confidence_interval'
        ]

        for i in alternatives:
            if (i == 'Greater'):
                # alternative hypothesis
                alternative_hypothesis = "true mean >" + str(hypothesized_mean)
                # p-values
                p_value = 1.0 - t.cdf(estimates, degree)
                # confidence interval - greater
                critical_val = t.ppf(1.0 - alpha, degree)
                width = critical_val * np.std(
                    table[input_col]) / math.sqrt(n - 1)
                lower_conf_interval = np.mean(table[input_col]) - width
                upper_conf_interval = math.inf

                # model
                alter = 'true mean > {hypothesized_mean}'.format(
                    hypothesized_mean=hypothesized_mean)
                alter_list.append(alter)
                p_list.append(p_value)
                conf_interval = '({lower_conf_interval}, {upper_conf_interval})'.format(
                    lower_conf_interval=lower_conf_interval,
                    upper_conf_interval=upper_conf_interval)
                CI_list.append(conf_interval)
                # out_table
                row = [[
                    data, alternative_hypothesis, statistics, estimates,
                    p_value, conf_level, lower_conf_interval,
                    upper_conf_interval
                ]]
                out_table = out_table.append(pd.DataFrame(row, columns=cols))

            if (i == 'Less'):
                # alternative hypothesis
                alternative_hypothesis = "true mean <" + str(hypothesized_mean)
                p_value = t.cdf(estimates, degree)
                # confidence interval - less
                critical_val = t.ppf(1.0 - alpha, degree)
                width = critical_val * np.std(
                    table[input_col]) / math.sqrt(n - 1)
                lower_conf_interval = -math.inf
                upper_conf_interval = np.mean(table[input_col]) + width

                # model
                alter = 'true mean < {hypothesized_mean}'.format(
                    hypothesized_mean=hypothesized_mean)
                alter_list.append(alter)
                p_list.append(p_value)
                conf_interval = '({lower_conf_interval}, {upper_conf_interval})'.format(
                    lower_conf_interval=lower_conf_interval,
                    upper_conf_interval=upper_conf_interval)
                CI_list.append(conf_interval)
                # out_table
                row = [[
                    data, alternative_hypothesis, statistics, estimates,
                    p_value, conf_level, lower_conf_interval,
                    upper_conf_interval
                ]]
                out_table = out_table.append(pd.DataFrame(row, columns=cols))

            if (i == 'Two Sided'):
                # alternative hypothesis
                alternative_hypothesis = "true mean !=" + str(
                    hypothesized_mean)
                # p_value = (1.0 - t.cdf(abs(estimates), degree)) * 2.0
                if (estimates >= 0):
                    p_value = 2.0 * t.cdf(-estimates, degree)
                else:
                    p_value = 2.0 * t.cdf(estimates, degree)
                # confidence interval - two-sided
                critical_val = t.ppf(1.0 - alpha / 2, degree)
                width = critical_val * np.std(
                    table[input_col]) / math.sqrt(n - 1)
                lower_conf_interval = np.mean(table[input_col]) - width
                upper_conf_interval = np.mean(table[input_col]) + width

                # model
                alter = 'true mean != {hypothesized_mean}'.format(
                    hypothesized_mean=hypothesized_mean)
                alter_list.append(alter)
                p_list.append(p_value)
                conf_interval = '({lower_conf_interval}, {upper_conf_interval})'.format(
                    lower_conf_interval=lower_conf_interval,
                    upper_conf_interval=upper_conf_interval)
                CI_list.append(conf_interval)
                # out_table
                row = [[
                    data, alternative_hypothesis, statistics, estimates,
                    p_value, conf_level, lower_conf_interval,
                    upper_conf_interval
                ]]
                out_table = out_table.append(pd.DataFrame(row, columns=cols))

        # Print model
        conf_level_percent = conf_level * 100
        result_table = pd.DataFrame.from_items(
            [['alternative hypothesis', alter_list], ['p-value', p_list],
             ['%g%% confidence Interval' % conf_level_percent, CI_list]])

        result = dict()
        result['result_table'] = result_table
        rb.addMD(
            strip_margin("""
        ### Data = {input_col}
        | - Estimates = {estimates} 
        |
        | {result_table}
        """.format(input_col=input_col,
                   estimates=estimates,
                   result_table=pandasDF2MD(result_table))))

    # print model
    result['report'] = rb.get()

    return {'out_table': out_table, 'model': result}
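The t statistic and the two-sided interval computed above can be reproduced directly with scipy; a sketch on a made-up sample that mirrors the same interval arithmetic (np.std with the default ddof, divided by sqrt(n - 1)).

import math
import numpy as np
from scipy import stats
from scipy.stats import t

x = np.array([5.1, 4.8, 5.6, 5.0, 4.9, 5.3])
hypothesized_mean, conf_level = 5.0, 0.95

t_stat, p_two_sided = stats.ttest_1samp(x, hypothesized_mean)

n = len(x)
width = t.ppf(1.0 - (1.0 - conf_level) / 2, n - 1) * np.std(x) / math.sqrt(n - 1)
print(t_stat, p_two_sided, (np.mean(x) - width, np.mean(x) + width))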
Example #11
def two_sample_ttest_for_stacked_data(table,
                                      response_cols,
                                      factor_col,
                                      alternatives,
                                      first,
                                      second,
                                      hypo_diff=0,
                                      equal_vari='pooled',
                                      confi_level=0.95):

    if (type(table[factor_col][0]) == str):
        table_first = table[table[factor_col] == first]
        table_second = table[table[factor_col] == second]
    elif (type(table[factor_col][0]) == bool):
        table_first = table[table[factor_col] == bool(first)]
        table_second = table[table[factor_col] == bool(second)]
    else:
        table_first = table[table[factor_col] == float(first)]
        table_second = table[table[factor_col] == float(second)]

    tmp_table = []

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    ## Two Sample T Test for Stacked Data Result
    | - Hypothesized mean = {hypo_diff}
    | - Confidence level = {confi_level}
    """.format(hypo_diff=hypo_diff, confi_level=confi_level)))

    for response_col in response_cols:
        tmp_model = []
        number1 = len(table_first[response_col])
        number2 = len(table_second[response_col])
        mean1 = (table_first[response_col]).mean()
        mean2 = (table_second[response_col]).mean()
        std1 = (table_first[response_col]).std()
        std2 = (table_second[response_col]).std()
        start_auto = 0
        # 'auto': run a two-sided F test on the sample variances to choose pooled vs. unequal (Welch) t test
        if (equal_vari == 'auto'):
            start_auto = 1
            f_value = (std1**2) / (std2**2)
            f_test_p_value_tmp = stats.f.cdf(1 / f_value, number1 - 1,
                                             number2 - 1)
            if (f_test_p_value_tmp > 0.5):
                f_test_p_value = (1 - f_test_p_value_tmp) * 2
            else:
                f_test_p_value = f_test_p_value_tmp * 2
            if (f_test_p_value < 0.05):
                equal_vari = 'unequal'
            else:
                equal_vari = 'pooled'
        ttestresult = ttest_ind(table_first[response_col],
                                table_second[response_col],
                                'larger',
                                usevar=equal_vari,
                                value=hypo_diff)

        if 'larger' in alternatives:
            ttestresult = ttest_ind(table_first[response_col],
                                    table_second[response_col],
                                    'larger',
                                    usevar=equal_vari,
                                    value=hypo_diff)
            df = ttestresult[2]
            if (equal_vari == 'pooled'):
                std_number1number2 = sqrt(
                    ((number1 - 1) * (std1)**2 + (number2 - 1) *
                     (std2)**2) / (number1 + number2 - 2))
                margin = t.ppf(
                    (confi_level),
                    df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if (equal_vari == 'unequal'):
                margin = t.ppf(
                    (confi_level), df) * sqrt(std1**2 / (number1) + std2**2 /
                                              (number2))
            tmp_model += [['true difference in means > 0.0'] +
                          [ttestresult[1]] +
                          [(mean1 - mean2 - margin, math.inf)]]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true difference in means > 0.0'] + [
                't statistic, t distribution with %f degrees of freedom under the null hypothesis'
                % ttestresult[2]
            ] + [ttestresult[0]] + [ttestresult[1]] + [confi_level] +
                          [mean1 - mean2 - margin] + [math.inf]]

        if 'smaller' in alternatives:
            ttestresult = ttest_ind(table_first[response_col],
                                    table_second[response_col],
                                    'smaller',
                                    usevar=equal_vari,
                                    value=hypo_diff)
            df = ttestresult[2]
            if (equal_vari == 'pooled'):
                std_number1number2 = sqrt(
                    ((number1 - 1) * (std1)**2 + (number2 - 1) *
                     (std2)**2) / (number1 + number2 - 2))
                margin = t.ppf(
                    (confi_level),
                    df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if (equal_vari == 'unequal'):
                margin = t.ppf(
                    (confi_level), df) * sqrt(std1**2 / (number1) + std2**2 /
                                              (number2))
            tmp_model += [['true difference in means < 0.0'] +
                          [ttestresult[1]] +
                          [(-math.inf, mean1 - mean2 + margin)]]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true difference in means < 0.0'] + [
                't statistic, t distribution with %f degrees of freedom under the null hypothesis'
                % ttestresult[2]
            ] + [ttestresult[0]] + [ttestresult[1]] + [confi_level] +
                          [-math.inf] + [mean1 - mean2 + margin]]

        if 'two-sided' in alternatives:
            ttestresult = ttest_ind(table_first[response_col],
                                    table_second[response_col],
                                    'two-sided',
                                    usevar=equal_vari,
                                    value=hypo_diff)
            df = ttestresult[2]
            if (equal_vari == 'pooled'):
                std_number1number2 = sqrt(
                    ((number1 - 1) * (std1)**2 + (number2 - 1) *
                     (std2)**2) / (number1 + number2 - 2))
                margin = t.ppf(
                    (confi_level),
                    df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if (equal_vari == 'unequal'):
                margin = t.ppf(
                    (confi_level), df) * sqrt(std1**2 / (number1) + std2**2 /
                                              (number2))
            tmp_model += [['true difference in means != 0.0'] +
                          [ttestresult[1]] +
                          [(mean1 - mean2 - margin, mean1 - mean2 + margin)]]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true difference in means != 0.0'] + [
                't statistic, t distribution with %f degrees of freedom under the null hypothesis'
                % ttestresult[2]
            ] + [ttestresult[0]] + [ttestresult[1]] + [confi_level] +
                          [mean1 - mean2 - margin] + [mean1 - mean2 + margin]]

        result_model = pd.DataFrame.from_records(tmp_model)
        result_model.columns = [
            'alternatives', 'p values',
            '%g%% confidence interval' % (confi_level * 100)
        ]
        rb.addMD(
            strip_margin("""
        | #### Data = {response_col} by {factor_col}({first},{second})
        |
        | - Statistics = t statistic, t distribution with {ttestresult2} degrees of freedom under the null hypothesis
        | - Estimates = {ttestresult0}
        |
        | {result_model}
        |
        """.format(ttestresult2=ttestresult[2],
                   response_col=response_col,
                   factor_col=factor_col,
                   first=first,
                   second=second,
                   ttestresult0=ttestresult[0],
                   result_model=pandasDF2MD(result_model))))
        if (start_auto == 1):
            equal_vari = 'auto'
    result = pd.DataFrame.from_records(tmp_table)
    result.columns = [
        'data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value',
        'confidence_level', 'lower_confidence_interval',
        'upper_confidence_interval'
    ]

    model = dict()
    model['report'] = rb.get()
    return {'out_table': result, 'model': model}
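A sketch of the statsmodels call at the core of the function, on two made-up samples; 'pooled' vs 'unequal' is the usevar switch chosen by the auto branch above.

import numpy as np
from statsmodels.stats.weightstats import ttest_ind

first_sample = np.array([1.2, 1.5, 1.1, 1.4, 1.3])
second_sample = np.array([1.9, 2.1, 1.8, 2.0, 2.2])

t_stat, p_value, dof = ttest_ind(first_sample, second_sample,
                                 alternative='two-sided', usevar='pooled', value=0)
print(t_stat, p_value, dof)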
Example #12
def _correlation(table, vars, method='pearson', height=2.5, corr_prec=2):
    size = len(vars)

    s_default = plt.rcParams['lines.markersize']**2.
    scatter_kws = {"s": s_default * height / 6.4}

    result_arr = []

    for i in range(size):
        for j in range(i):
            if method == 'pearson':
                r, p = stats.pearsonr(table[vars[i]], table[vars[j]])
            elif method == 'spearman':
                r, p = stats.spearmanr(table[vars[i]], table[vars[j]])
            elif method == 'kendal':
                r, p = stats.kendalltau(table[vars[i]], table[vars[j]])

            result_arr.append([vars[i], vars[j], r, p])

    df_result = pd.DataFrame(result_arr, columns=['x', 'y', 'corr', 'p_value'])

    def corr(x, y, **kwargs):
        if kwargs['method'] == 'pearson':
            r, p = stats.pearsonr(x, y)
        elif kwargs['method'] == 'spearman':
            r, p = stats.spearmanr(x, y)
        elif kwargs['method'] == 'kendal':
            r, p = stats.kendalltau(x, y)

        p_stars = ''
        if p <= 0.05:
            p_stars = '*'
        if p <= 0.01:
            p_stars = '**'
        if p <= 0.001:
            p_stars = '***'

        corr_text = '{:.{prec}f}'.format(r, prec=corr_prec)
        font_size = abs(r) * 15 * 2 / corr_prec + 5
        ax = plt.gca()
        ax.annotate(corr_text, [
            .5,
            .5,
        ],
                    xycoords="axes fraction",
                    ha='center',
                    va='center',
                    fontsize=font_size * height)
        ax.annotate(p_stars,
                    xy=(0.65, 0.6),
                    xycoords=ax.transAxes,
                    color='red',
                    fontsize=17 * height)

    g = sns.PairGrid(table, vars=vars, height=height)
    g.map_diag(sns.distplot)
    if method == 'pearson':
        g.map_lower(sns.regplot, scatter_kws=scatter_kws)
    else:
        g.map_lower(sns.regplot, lowess=True, scatter_kws=scatter_kws)
    g.map_upper(corr, method=method)

    fig_corr = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(
        strip_margin(""" ## Correlation Results
        | ### Correlation Matrix
        | {fig_corr}
        |
        | ### Correlation Table
        | {table}
        """.format(fig_corr=fig_corr, table=pandasDF2MD(df_result))))

    params = {'vars': vars, 'method': method, 'height': height}

    res = dict()
    res['params'] = params
    res['corr_table'] = df_result
    res['report'] = rb.get()

    return {'result': res}
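The pairwise statistics used in both the result table and the annotation callback come straight from scipy; a short sketch on made-up vectors.

from scipy import stats

x = [1.0, 2.0, 3.0, 4.0, 5.0]
y = [1.1, 1.9, 3.2, 3.8, 5.1]

r_p, p_p = stats.pearsonr(x, y)    # 'pearson' branch
r_s, p_s = stats.spearmanr(x, y)   # 'spearman' branch
r_k, p_k = stats.kendalltau(x, y)  # 'kendal' branch
print((r_p, p_p), (r_s, p_s), (r_k, p_k))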
Example #13
def _xgb_regression_train(table,
                          feature_cols,
                          label_col,
                          max_depth=3,
                          learning_rate=0.1,
                          n_estimators=100,
                          silent=True,
                          objective='reg:linear',
                          booster='gbtree',
                          n_jobs=1,
                          nthread=None,
                          gamma=0,
                          min_child_weight=1,
                          max_delta_step=0,
                          subsample=1,
                          colsample_bytree=1,
                          colsample_bylevel=1,
                          reg_alpha=0,
                          reg_lambda=1,
                          scale_pos_weight=1,
                          base_score=0.5,
                          random_state=0,
                          seed=None,
                          missing=None,
                          sample_weight=None,
                          eval_set=None,
                          eval_metric=None,
                          early_stopping_rounds=None,
                          verbose=True,
                          xgb_model=None,
                          sample_weight_eval_set=None):

    regressor = XGBRegressor(max_depth, learning_rate, n_estimators, silent,
                             objective, booster, n_jobs, nthread, gamma,
                             min_child_weight, max_delta_step, subsample,
                             colsample_bytree, colsample_bylevel, reg_alpha,
                             reg_lambda, scale_pos_weight, base_score,
                             random_state, seed, missing)
    regressor.fit(table[feature_cols], table[label_col], sample_weight,
                  eval_set, eval_metric, early_stopping_rounds, verbose,
                  xgb_model, sample_weight_eval_set)

    # json
    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_
    #     plt.rcdefaults()
    plot_importance(regressor)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    #     plt.rcParams['figure.dpi'] = figure_dpi
    #     plot_tree(regressor)
    #     fig_plot_tree_UT = plt2MD(plt)
    #     plt.clf()
    #     plt.rcParams['figure.dpi'] = figure_dpi
    #     plot_tree(regressor, rankdir='LR')
    #     fig_plot_tree_LR = plt2MD(plt)
    #     plt.rcdefaults()
    #     plt.clf()

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance
    #     out_model['plot_tree_UT'] = fig_plot_tree_UT
    #     out_model['plot_tree_LR'] = fig_plot_tree_LR
    #         out_model['to_graphviz'] = md_to_graphviz

    # report
    get_param_list = []
    get_param_list.append(['feature_cols', feature_cols])
    get_param_list.append(['label_col', label_col])
    for key, value in get_param.items():
        temp = [key, value]
        get_param_list.append(temp)
    get_param_df = pd.DataFrame(data=get_param_list,
                                columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Regression Result
    |
    | ### Plot Importance
    | {image_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {table_parameter}
    |
    """.format(image_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               table_parameter=pandasDF2MD(get_param_df))))
    out_model['report'] = rb.get()

    return {'model': out_model}
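A minimal sketch of the xgboost calls involved, assuming the xgboost package is installed; the random frame and column names are illustrative.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor, plot_importance

table = pd.DataFrame(np.random.rand(30, 3), columns=['f0', 'f1', 'y'])

regressor = XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100)
regressor.fit(table[['f0', 'f1']], table['y'])

print(regressor.feature_importances_)
plot_importance(regressor)  # the bar chart embedded in the report above
plt.tight_layout()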
Example #14
def _evaluate_classification(table, label_col, prediction_col):

    label = table[label_col]
    predict = table[prediction_col]

    # compute metrics
    accuracy = accuracy_score(label, predict)
    f1 = f1_score(label, predict, average="weighted")
    precision = precision_score(label, predict, average="weighted")
    recall = recall_score(label, predict, average="weighted")
    class_names = np.unique(np.union1d(label.values, predict.values))

    # Plot non-normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label,
                           predict,
                           classes=class_names,
                           title='Confusion matrix, without normalization')
    fig_cnf_matrix = plt2MD(plt)
    # Plot normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label,
                           predict,
                           classes=class_names,
                           normalize=True,
                           title='Normalized confusion matrix')
    fig_cnf_matrix_normalized = plt2MD(plt)
    plt.clf()

    # json
    summary = dict()
    summary['label_col'] = label_col
    summary['prediction_col'] = prediction_col
    summary['f1_score'] = f1
    summary['accuracy_score'] = accuracy
    summary['precision_score'] = precision
    summary['recall_score'] = recall

    # report
    all_dict_list = [{
        'f1': f1,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall
    }]
    all_df = pd.DataFrame(all_dict_list)
    all_df = all_df[['f1', 'accuracy', 'precision', 'recall']]
    summary['metrics'] = all_df

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Evaluate Classification Result
    | ### Metrics
    | {table1}
    |
    | ### Confusion matrix
    | {fig_confusion_matrix}
    |
    | {fig_confusion_matrix_normalized}
    |
    """.format(table1=pandasDF2MD(all_df),
               fig_confusion_matrix=fig_cnf_matrix,
               fig_confusion_matrix_normalized=fig_cnf_matrix_normalized)))
    summary['report'] = rb.get()

    return {'result': summary}
Example #15
def paired_ttest(table, first_column, second_column, alternative, hypothesized_difference=0, confidence_level=0.95):
    df = len(table) - 1
    diff_mean = (table[first_column] - table[second_column]).mean()
    std_dev = np.std(table[first_column] - table[second_column])
    t_value = stats.ttest_rel(table[first_column], table[second_column] + hypothesized_difference)[0]
    p_value_ul = stats.ttest_rel(table[first_column], table[second_column] + hypothesized_difference)[1]
    p_value_u = stats.t.sf(t_value, df)
    p_value_l = stats.t.cdf(t_value, df)

    left_u = diff_mean - std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df)
    right_l = diff_mean + std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df)
    left_ul = diff_mean - std_dev * stats.t.isf((1 - confidence_level) / 2, df) / np.sqrt(df)
    right_ul = diff_mean + std_dev * stats.t.isf((1 - confidence_level) / 2, df) / np.sqrt(df)

    result_value_u = [{'data' : first_column + " , " + second_column,
                 'alternative_hypothesis' : "true difference in means > " + str(hypothesized_difference),
                 'statistics' : "t statistics, t distribution with " + str(df) + " degrees of freedom under the null hypothesis",
                 'estimates' : t_value,
                 'p_value' : p_value_u,
                 'confidence_level' : confidence_level,
                 'low_confidence_interval' : left_u,
                 'upper_confidence_interval' : np.Infinity}]
    result_value_l = [{'data' : first_column + " , " + second_column,
                 'alternative_hypothesis' : "true difference in means < " + str(hypothesized_difference),
                 'statistics' : "t statistics, t distribution with " + str(df) + " degrees of freedom under the null hypothesis",
                 'estimates' : t_value,
                 'p_value' : p_value_l,
                 'confidence_level' : confidence_level,
                 'low_confidence_interval' :-np.Infinity,
                 'upper_confidence_interval' : right_l}]
    result_value_ul = [{'data' : first_column + " , " + second_column,
                 'alternative_hypothesis' : "true difference in means != " + str(hypothesized_difference),
                 'statistics' : "t statistics, t distribution with " + str(df) + " degrees of freedom under the null hypothesis",
                 'estimates' : t_value,
                 'p_value' : p_value_ul,
                 'confidence_level' : confidence_level,
                 'low_confidence_interval' : left_ul,
                 'upper_confidence_interval' : right_ul}]

    df_result = pd.DataFrame()
    df_u = pd.DataFrame(result_value_u, columns=['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'low_confidence_interval', 'upper_confidence_interval'])
    df_l = pd.DataFrame(result_value_l, columns=['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'low_confidence_interval', 'upper_confidence_interval'])
    df_ul = pd.DataFrame(result_value_ul, columns=['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'low_confidence_interval', 'upper_confidence_interval'])

    if 'greater' in alternative:
        df_result = df_result.append(df_u, ignore_index=True)
    if 'less' in alternative:
        df_result = df_result.append(df_l, ignore_index=True)
    if 'twosided' in alternative:
        df_result = df_result.append(df_ul, ignore_index=True)

    result_table_ul = pd.DataFrame([{'Alternative': 'Two Sided', 'H1': 'true difference in means != ' + str(hypothesized_difference), 't_value': t_value, 'p_value': p_value_ul, str(confidence_level * 100) + '% confidence interval': '(' + str(left_ul) + ', ' + str(right_ul) + ')'}])
    result_table_u = pd.DataFrame([{'Alternative': 'Greater', 'H1': 'true difference in means > ' + str(hypothesized_difference), 't_value': t_value, 'p_value': p_value_u, str(confidence_level * 100) + '% confidence interval': '(' + str(left_u) + ', ' + str(np.Infinity) + ')'}])
    result_table_l = pd.DataFrame([{'Alternative': 'Less', 'H1': 'true difference in means < ' + str(hypothesized_difference), 't_value': t_value, 'p_value': p_value_l, str(confidence_level * 100) + '% confidence interval': '(' + str(-np.Infinity) + ', ' + str(right_l) + ')'}])
    result_table = pd.DataFrame()

    if 'greater' in alternative:
        result_table = result_table.append(result_table_u, ignore_index=True)
    if 'less' in alternative:
        result_table = result_table.append(result_table_l, ignore_index=True)
    if 'twosided' in alternative:
        result_table = result_table.append(result_table_ul, ignore_index=True)

    ordered_result_table = pd.DataFrame(result_table, columns=['Alternative', 'H1', 't_value', 'p_value', str(confidence_level * 100) + '% confidence interval'])

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    |## Paired T Test Result
    |##### df : {deg_f}
    |##### Mean of differences : {dm}
    |##### Standard deviation : {sd}
    |
    |{result_table}
    |
    """.format(deg_f=df, dm=diff_mean, sd=std_dev, result_table=pandasDF2MD(ordered_result_table))))

    model = dict()
    model['report'] = rb.get()

    return{'out_table':df_result, 'model':model}
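The paired-t core can be checked in isolation with scipy; a sketch on made-up before/after measurements, using the same one-sided tail calls as above.

import numpy as np
from scipy import stats

before = np.array([10.2, 9.8, 11.1, 10.5])
after = np.array([10.9, 10.1, 11.6, 11.0])
hypothesized_difference = 0
df = len(before) - 1

t_value, p_two_sided = stats.ttest_rel(before, after + hypothesized_difference)
p_greater = stats.t.sf(t_value, df)   # one-sided, 'greater'
p_less = stats.t.cdf(t_value, df)     # one-sided, 'less'
print(t_value, p_two_sided, p_greater, p_less)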
Example #16
def _hierarchical_clustering(table, input_cols, link='complete', met='euclidean', p=2, num_rows=20, figure_height=6.4, orient='right'):
    table = table.copy()
    df = table[input_cols]
    Z = linkage(df, method=link, metric=met)
    out_table = pd.DataFrame([])
    out_table['linkage_step'] = [x + 1 for x in reversed(range(len(Z)))]
    out_table['joined_column1'] = ['pt_' + str(int(Z[:, 0][i])) for i in range(len(Z))]
    out_table['joined_column2'] = ['pt_' + str(int(Z[:, 1][i])) for i in range(len(Z))]
    out_table['name_of_clusters'] = ['CL_' + str(i + 1) for i in reversed(range(len(Z)))]
    out_table['distance'] = [distance for distance in Z[:, 2]]
    out_table['number_of_original'] = [int(entities) for entities in Z[:, 3]]
    
    # replace a raw point id with its cluster name when the merge joins an existing cluster

    for i in range(len(Z)):
        if Z[:, 0][i] >= len(df) :
            out_table['joined_column1'][i] = out_table['name_of_clusters'][Z[:, 0][i] - len(df)]
        if Z[:, 1][i] >= len(df) :
            out_table['joined_column2'][i] = out_table['name_of_clusters'][Z[:, 1][i] - len(df)]
    out_table = out_table.reindex(index=out_table.index[::-1])[0:]
    out_table1 = out_table.head(num_rows)
    
    # calculate full dendrogram
    def _llf(point_id):
        n = len(df)
        if point_id < n:
            return 'pt_' + str(point_id)
 
    plt.figure(figsize=(8.4, figure_height))
    _fancy_dendrogram(
        Z,
        truncate_mode='none',  # 'none' draws the full tree; 'lastp' would show only the last p merged clusters
        get_leaves=True,
        orientation=orient,
        labels=True,
        leaf_label_func=_llf,
        leaf_rotation=45,
        leaf_font_size=5.,
        show_contracted=False,  # to get a distribution impression in truncated branches
        annotate_above=float(10),  # useful in small plots so annotations don't overlap
        # max_d=distance_threshold, # will plot a horizontal cut-off line, max_d as in max_distance
    )
    plt.title('Hierarchical Clustering Dendrogram')
    if orient=='top':
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient=='right':
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    
    plt2 = plt2MD(plt)
    plt.clf()
    
    rb = ReportBuilder()
    params = { 
        'Input Columns': input_cols,
        'Linkage Method': link,
        'Metric': met,
        'Number of Rows in Linkage Matrix': num_rows
    }
    rb.addMD(strip_margin("""### Hierarchical Clustering Result"""))
    rb.addMD(strip_margin("""
    |## Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    | {display_params}
    |
    |## Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=plt2, display_params=dict2MD(params), out_table1=pandasDF2MD(out_table1))))

    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_cols'] = input_cols
    model['parameters'] = params
    model['outtable'] = out_table
    model['report'] = rb.get()
        
    return { 'model':model}
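A sketch of the same linkage plus a dendrogram, using scipy's stock dendrogram in place of the custom _fancy_dendrogram helper (random data for illustration).

import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

X = np.random.rand(12, 2)
Z = linkage(X, method='complete', metric='euclidean')  # same linkage matrix layout as above

plt.figure(figsize=(8.4, 6.4))
dendrogram(Z, orientation='right', leaf_font_size=5.0)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Distance')
plt.ylabel('Samples')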
Example #17
def _naive_bayes_train(table,
                       feature_cols,
                       label_col,
                       alpha=1.0,
                       fit_prior=True,
                       class_prior=None):

    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        tmp_class_prior = [0 for x in range(len(class_prior))]
        for elems in class_prior:
            tmp = elems.split(":")
            tmp_class_prior[label_encoder.transform([tmp[0]
                                                     ])[0]] = float(tmp[1])
        class_prior = tmp_class_prior

    nb_model = MultinomialNB(alpha, fit_prior, class_prior)
    nb_model.fit(features, label_correspond)
    class_log_prior = nb_model.class_log_prior_
    feature_log_prob_ = nb_model.feature_log_prob_
    tmp_result = np.hstack(
        (list(map(list, zip(*[label_encoder.classes_] + [class_log_prior]))),
         (feature_log_prob_)))
    column_names = ['labels', 'pi']
    for feature_col in feature_cols:
        column_names += ['theta_' + feature_col]
    result_table = pd.DataFrame.from_records(tmp_result, columns=column_names)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    # get_param['Prior Probabilities of the Classes'] = class_prior
    get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    _plot_confusion_matrix(cnf_matrix,
                           classes=label_encoder.classes_,
                           title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Naive Bayes Classification Result
    |
    | ### Model:Multinomial
    | {result_table}
    | ### Parameters
    | {table_parameter} 
    | ### Predicted vs Actual
    | {image1}
    | #### Accuracy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix,
               accuracy=accuracy,
               result_table=pandasDF2MD(result_table),
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['report'] = rb.get()

    return {'model': model}
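A minimal sketch of the scikit-learn pieces (label encoding plus multinomial naive Bayes) on a made-up count table.

import pandas as pd
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB

table = pd.DataFrame({'w1': [2, 0, 1, 3], 'w2': [0, 3, 2, 0],
                      'label': ['spam', 'ham', 'ham', 'spam']})

label_encoder = preprocessing.LabelEncoder()
label_correspond = label_encoder.fit_transform(table['label'])

nb_model = MultinomialNB(alpha=1.0, fit_prior=True)
nb_model.fit(table[['w1', 'w2']], label_correspond)
print(nb_model.class_log_prior_, nb_model.feature_log_prob_)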
Example #18
def _chi_square_test_of_independence(table,
                                     response_cols,
                                     factor_col,
                                     correction=False):
    label_list = []
    feature_list = []
    alternative_hypothesis_list = []
    dof_list = []
    stat_chi_list = []
    p_chi_list = []
    for response_col in response_cols:
        response = table[response_col]
        contingency_table = pd.crosstab(table[response_col],
                                        table[factor_col],
                                        margins=True)
        response_index = len(contingency_table) - 1
        factor_index = len(contingency_table.columns) - 1
        temporary = contingency_table.iloc[0:response_index, 0:factor_index]
        f_object = np.array(temporary)
        test = stats.chi2_contingency(f_object, correction, 1)[0:3]
        label = '{factor_col}'.format(factor_col=factor_col)
        feature = '{response_col}'.format(response_col=response_col)
        if test[1] < 0.05:
            dependence = 'Reject the null hypothesis that two categorical variables are independent at 5% significance level.'
        elif test[1] >= 0.05:
            dependence = 'No association was found between two categorical variables at 5% significance level.'
        elif math.isnan(test[1]):
            dependence = 'Independence of two categorical variables cannot be decided.'
        conclusion = '{dependence}'.format(dependence=dependence)
        alternative_hypothesis = 'Two categorical variables are dependent.'
        dof = 'chi-square distribution with {dof} degrees of freedom'.format(
            dof=test[2])
        stat_chi = '{stat_chi}'.format(stat_chi=test[0])
        p_chi = '{p_chi}'.format(p_chi=test[1])
        label_list.append(label)
        feature_list.append(feature)
        alternative_hypothesis_list.append(alternative_hypothesis)
        dof_list.append(dof)
        stat_chi_list.append(stat_chi)
        p_chi_list.append(p_chi)

    result_table = pd.DataFrame.from_items(
        [['label', label_list], ['feature', feature_list],
         ['alternative_hypothesis', alternative_hypothesis_list],
         ['df', dof_list], ['estimate', stat_chi_list],
         ['p_value', p_chi_list]])

    result = dict()
    result['result_table'] = result_table

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Chi-square Test of Independence Result
    |  - H0: the two categorical variables are independent.
    |  - H1: the two categorical variables are dependent.
    """))
    for response_col in response_cols:
        response = table[response_col]
        contingency_table = pd.crosstab(table[response_col],
                                        table[factor_col],
                                        margins=True)
        response_index = len(contingency_table) - 1
        factor_index = len(contingency_table.columns) - 1
        temporary = contingency_table.iloc[0:response_index, 0:factor_index]
        f_object = np.array(temporary)
        test = stats.chi2_contingency(f_object, correction, 1)[0:3]
        label = '{factor_col}'.format(factor_col=factor_col)
        feature = '{response_col}'.format(response_col=response_col)
        if test[1] < 0.05:
            dependence = 'Reject the null hypothesis that two categorical variables are independent at 5% significance level.'
        elif test[1] >= 0.05:
            dependence = 'No association was found between two categorical variables at 5% significance level.'
        elif math.isnan(test[1]):
            dependence = 'Independence of two categorical variables cannot be decided.'
        dof_simplelist = []
        stat_chi_simplelist = []
        p_chi_simplelist = []
        dof = '{dof}'.format(dof=test[2])
        stat_chi = '{stat_chi}'.format(stat_chi=test[0])
        p_chi = '{p_chi}'.format(p_chi=test[1])
        stat_chi_simplelist.append(stat_chi)
        dof_simplelist.append(dof)
        p_chi_simplelist.append(p_chi)
        result_table_simple = pd.DataFrame.from_items(
            [['estimate', stat_chi_simplelist], ['df', dof_simplelist],
             ['p_value', p_chi_simplelist]])

        # test statistic = {test_statistic}, df = {dof}, p_value = {p_value}
        # test_statistic = stats.chi2_contingency(f_object,correction,lambda_)[0], dof=stats.chi2_contingency(f_object,correction,lambda_)[2], p_value=stats.chi2_contingency(f_object,correction,lambda_)[1]
        rb.addMD(
            strip_margin("""
        |### Label: {label}, Feature: {feature}
        |  
        |{result_table_simple}
        |
        |{dependence}
        |
        |
        """.format(label=factor_col,
                   feature=response_col,
                   result_table_simple=pandasDF2MD(result_table_simple),
                   dependence=dependence)))

    model = _model_dict('Chi-square test of independence')

    model['report'] = rb.get()


    return {'model': model}
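
For reference, a minimal, self-contained sketch of the Pearson chi-square computation that the function above wraps; the toy columns 'gender' and 'preference' are made up for illustration.

# Minimal sketch of the chi-square test of independence on a toy table.
import pandas as pd
from scipy import stats

toy = pd.DataFrame({'gender': ['M', 'M', 'F', 'F', 'M', 'F', 'M', 'F'],
                    'preference': ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A']})
observed = pd.crosstab(toy['preference'], toy['gender'])  # no margins
# lambda_=1 selects the Pearson statistic, as in the function above.
chi2, p, dof, expected = stats.chi2_contingency(observed, correction=False,
                                                lambda_=1)
print(chi2, dof, p)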
Example #19
0
def ftest_for_stacked_data(table,
                           response_cols,
                           factor_col,
                           alternatives,
                           first,
                           second,
                           confi_level=0.95):
    # Use .iloc[0] so this also works when the index does not start at 0.
    first_value = table[factor_col].iloc[0]
    if isinstance(first_value, str):
        table_first = table[table[factor_col] == first]
        table_second = table[table[factor_col] == second]
    elif isinstance(first_value, bool):
        table_first = table[table[factor_col] == bool(first)]
        table_second = table[table[factor_col] == bool(second)]
    else:
        table_first = table[table[factor_col] == float(first)]
        table_second = table[table[factor_col] == float(second)]

    tmp_table = []
    number1 = len(table_first[factor_col])
    number2 = len(table_second[factor_col])
    d_num = number1 - 1
    d_denum = number2 - 1
    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    ## F Test for Stacked Data Result
    | - Confidence level = {confi_level}
    | - Statistics = F statistic, F distribution with {d_num} numerator degrees of freedom and {d_denum} denominator degrees of freedom under the null hypothesis
    """.format(confi_level=confi_level, d_num=d_num, d_denum=d_denum)))

    for response_col in response_cols:
        tmp_model = []
        std1 = (table_first[response_col]).std()
        std2 = (table_second[response_col]).std()
        f_value = (std1**2) / (std2**2)

        if 'larger' in alternatives:
            # P(F >= f_value) under H0, with the numerator df listed first.
            p_value = scipy.stats.f.sf(f_value, d_num, d_denum)
            tmp_model += [
                ['true ratio > 1'] + [p_value] +
                [(f_value /
                  (scipy.stats.f.ppf(confi_level, d_num, d_denum)), math.inf)]
            ]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true ratio of variances > 1'] + [
                'F statistic, F distribution with %d numerator degrees of freedom and %d denominator degrees of freedom under the null hypothesis.'
                % (d_num, d_denum)
            ] + [f_value] + [p_value] + [confi_level] + [
                f_value / (scipy.stats.f.ppf(confi_level, d_num, d_denum))
            ] + [math.inf]]

        if 'smaller' in alternatives:
            p_value = scipy.stats.f.cdf(f_value, d_num, d_denum)
            tmp_model += [['true ratio < 1'] + [p_value] +
                          [(0, f_value *
                            (scipy.stats.f.ppf(confi_level, d_denum, d_num)))]]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true ratio of variances < 1'] + [
                'F statistic, F distribution with %d numerator degrees of freedom and %d denominator degrees of freedom under the null hypothesis.'
                % (d_num, d_denum)
            ] + [f_value] + [p_value] + [confi_level] + [0] + [
                f_value * (scipy.stats.f.ppf(confi_level, d_denum, d_num))
            ]]

        if 'two-sided' in alternatives:
            # Two-sided p-value: twice the smaller tail probability of F(d_num, d_denum).
            p_value_tmp = scipy.stats.f.cdf(f_value, d_num, d_denum)
            p_value = 2 * min(p_value_tmp, 1 - p_value_tmp)
            tmp_model += [
                ['true ratio != 1'] + [p_value] +
                [(f_value / (scipy.stats.f.ppf(
                    (1 + confi_level) / 2, d_num, d_denum)), f_value *
                  (scipy.stats.f.ppf((1 + confi_level) / 2, d_denum, d_num)))]
            ]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true ratio of variances != 1'] + [
                'F statistic, F distribution with %d numerator degrees of freedom and %d denominator degrees of freedom under the null hypothesis.'
                % (d_num, d_denum)
            ] + [f_value] + [p_value] + [confi_level] + [
                f_value / (scipy.stats.f.ppf(
                    (1 + confi_level) / 2, d_num, d_denum))
            ] + [
                f_value * (scipy.stats.f.ppf(
                    (1 + confi_level) / 2, d_denum, d_num))
            ]]

        result_model = pd.DataFrame.from_records(tmp_model)
        result_model.columns = [
            'alternatives', 'p values',
            '%g%% confidence interval' % (confi_level * 100)
        ]
        rb.addMD(
            strip_margin("""
        | #### Data = {response_col} by {factor_col}({first},{second})
        | - Estimate = {f_value}
        |
        | {result_model}
        |
        """.format(response_col=response_col,
                   factor_col=factor_col,
                   first=first,
                   second=second,
                   f_value=f_value,
                   result_model=pandasDF2MD(result_model))))

    result = pd.DataFrame.from_records(tmp_table)
    result.columns = [
        'data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value',
        'confidence_level', 'lower_confidence_interval',
        'upper_confidence_interval'
    ]

    model = dict()
    model['report'] = rb.get()
    return {'out_table': result, 'model': model}
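
For reference, a minimal sketch of the variance-ratio F test that the function above performs per column; the sample values and degrees of freedom are illustrative only.

# Minimal sketch of the two-sample variance-ratio F test.
import numpy as np
import scipy.stats

rng = np.random.default_rng(0)
x = rng.normal(0, 1.0, size=30)   # illustrative "first" group
y = rng.normal(0, 1.5, size=25)   # illustrative "second" group

d_num, d_denum = len(x) - 1, len(y) - 1
f_value = x.std(ddof=1) ** 2 / y.std(ddof=1) ** 2  # ratio of sample variances

# H1: true ratio of variances > 1
p_larger = scipy.stats.f.sf(f_value, d_num, d_denum)
# H1: true ratio of variances != 1
tail = scipy.stats.f.cdf(f_value, d_num, d_denum)
p_two_sided = 2 * min(tail, 1 - tail)
print(f_value, p_larger, p_two_sided)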
Example #20
0
def _paired_ttest(table,
                  first_column,
                  second_column,
                  alternative=['greater', 'less', 'twosided'],
                  hypothesized_difference=0,
                  confidence_level=0.95):

    df = len(table) - 1
    first_col = table[first_column]
    second_col = table[second_column]

    diff_mean = (first_col - second_col).mean()
    # np.std uses ddof=0; together with the sqrt(df) denominator used below this is
    # algebraically equivalent to the usual ddof=1 standard error over sqrt(n).
    std_dev = np.std(first_col - second_col)
    t_value = stats.ttest_rel(first_col,
                              second_col + hypothesized_difference)[0]

    result = []
    alternative_hypothesis = []
    p_value = []
    confidence_interval = []

    if 'greater' in alternative:
        alternative_hypothesis.append('true difference in means > ' +
                                      str(hypothesized_difference))
        p_value.append(stats.t.sf(t_value, df))
        confidence_interval.append((diff_mean - std_dev * stats.t.isf(
            (1 - confidence_level), df) / np.sqrt(df), np.Infinity))

    if 'less' in alternative:
        alternative_hypothesis.append('true difference in means < ' +
                                      str(hypothesized_difference))
        p_value.append(stats.t.cdf(t_value, df))
        confidence_interval.append(
            (-np.Infinity, diff_mean + std_dev * stats.t.isf(
                (1 - confidence_level), df) / np.sqrt(df)))

    if 'twosided' in alternative:
        alternative_hypothesis.append('true difference in means != ' +
                                      str(hypothesized_difference))
        p_value.append(
            stats.ttest_rel(first_col,
                            second_col + hypothesized_difference)[1])
        other_term = std_dev * stats.t.isf(
            (1 - confidence_level) / 2, df) / np.sqrt(df)
        confidence_interval.append(
            (diff_mean - other_term, diff_mean + other_term))

    result.append(['alternative hypothesis', alternative_hypothesis])
    result.append(['t-value', t_value])
    result.append(['p-value', p_value])
    result.append([
        '%g%% confidence Interval' % (confidence_level * 100),
        confidence_interval
    ])
    result_table = pd.DataFrame.from_items(result)

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    |## Paired T Test Result
    |##### df : {deg_f}
    |##### Mean of differences : {dm}
    |##### Standard deviation of differences : {sd}
    |
    |{result_table}
    |
    """.format(deg_f=df,
               dm=diff_mean,
               sd=std_dev,
               result_table=pandasDF2MD(result_table))))

    model = dict()
    model['report'] = rb.get()

    return {'model': model}
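
For reference, a minimal sketch of the paired-t computation that the function above reports; the before/after measurements are illustrative only.

# Minimal sketch of the paired t-test and its two-sided confidence interval.
import numpy as np
from scipy import stats

before = np.array([12.1, 11.8, 13.0, 12.6, 12.2, 11.9])
after = np.array([11.7, 11.9, 12.4, 12.1, 12.0, 11.5])

t_value, p_two_sided = stats.ttest_rel(before, after)
df = len(before) - 1
diff = before - after
# 95% two-sided confidence interval for the mean of the differences
half_width = diff.std(ddof=1) / np.sqrt(len(diff)) * stats.t.isf(0.025, df)
ci = (diff.mean() - half_width, diff.mean() + half_width)
print(t_value, p_two_sided, ci)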
Example #21
0
def _one_sample_ttest(table,
                      input_cols,
                      alternatives,
                      hypothesized_mean=0,
                      conf_level=0.95):

    cols = [
        'data', 'alternative_hypothesis', 'statistics', 't_value', 'p_value',
        'confidence_level', 'lower_confidence_interval',
        'upper_confidence_interval'
    ]
    out_table = pd.DataFrame(columns=cols)
    n = len(table)
    alpha = 1.0 - conf_level
    statistics = "t statistic, t distribution with %d degrees of freedom under the null hypothesis." % (
        n - 1)

    # ## Build model
    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    ## One Sample T Test Result
    | - Statistics = {s}
    | - Hypothesized mean = {h} 
    | - Confidence level = {cl}
    """.format(s=statistics, h=hypothesized_mean, cl=conf_level)))

    for input_col in input_cols:

        col = table[input_col]

        H1_list = []
        p_list = []
        CI_list = []

        # width of the confidence interval
        width_one_sided = _width(col, alpha, n)
        width_two_sided = _width(col, alpha / 2, n)

        # t-statistic, two-tailed p-value
        t_value, p_value_two = stats.ttest_1samp(col, hypothesized_mean)

        # one-tailed p-value for Greater
        if t_value >= 0:
            p_value_one = p_value_two / 2
        else:
            p_value_one = 1.0 - p_value_two / 2

        for alter in alternatives:
            if alter == 'Greater':
                H1 = 'true mean > {hypothesized_mean}'.format(
                    hypothesized_mean=hypothesized_mean)
                p_value = p_value_one
                lower_conf_interval = np.mean(col) - width_one_sided
                upper_conf_interval = np.inf

            if alter == 'Less':
                H1 = 'true mean < {hypothesized_mean}'.format(
                    hypothesized_mean=hypothesized_mean)
                p_value = 1.0 - p_value_one
                lower_conf_interval = -np.inf
                upper_conf_interval = np.mean(col) + width_one_sided

            if alter == 'Two Sided':
                H1 = 'true mean != {hypothesized_mean}'.format(
                    hypothesized_mean=hypothesized_mean)
                p_value = p_value_two
                lower_conf_interval = np.mean(col) - width_two_sided
                upper_conf_interval = np.mean(col) + width_two_sided

            # ## Build out_table
            out = pd.Series([
                input_col, H1, statistics, t_value, p_value, conf_level,
                lower_conf_interval, upper_conf_interval
            ],
                            index=cols)
            out_table = out_table.append(out, ignore_index=True)

            # ## Build model
            H1_list.append(H1)
            p_list.append(p_value)
            CI_list.append(
                '({lower_conf_interval}, {upper_conf_interval})'.format(
                    lower_conf_interval=lower_conf_interval,
                    upper_conf_interval=upper_conf_interval))

        # ## Build model
        result_table = pd.DataFrame.from_items(
            [['alternative hypothesis', H1_list], ['p-value', p_list],
             ['%g%% confidence Interval' % (conf_level * 100), CI_list]])
        rb.addMD(
            strip_margin("""
        ### Data = {input_col}
        | - t-value = {t_value} 
        |
        | {result_table}
        """.format(input_col=input_col,
                   t_value=t_value,
                   result_table=pandasDF2MD(result_table))))

    model = dict()
    model['report'] = rb.get()

    return {'out_table': out_table, 'model': model}
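
For reference, a minimal sketch of the one-sample t computation that the function above wraps, including the two-sided to one-sided conversion; the sample values and the hypothesized mean of 10 are illustrative only.

# Minimal sketch of the one-sample t-test with a one-sided p-value.
import numpy as np
from scipy import stats

sample = np.array([10.2, 9.8, 10.5, 10.1, 9.9, 10.4, 10.3])
hypothesized_mean = 10.0

t_value, p_two_sided = stats.ttest_1samp(sample, hypothesized_mean)
# One-sided p-value for H1: true mean > hypothesized_mean
if t_value >= 0:
    p_greater = p_two_sided / 2
else:
    p_greater = 1.0 - p_two_sided / 2
print(t_value, p_two_sided, p_greater)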
Example #22
0
def _xgb_classification_train(table,
                              feature_cols,
                              label_col,
                              max_depth=3,
                              learning_rate=0.1,
                              n_estimators=100,
                              silent=True,
                              objective='binary:logistic',
                              booster='gbtree',
                              n_jobs=1,
                              nthread=None,
                              gamma=0,
                              min_child_weight=1,
                              max_delta_step=0,
                              subsample=1,
                              colsample_bytree=1,
                              colsample_bylevel=1,
                              reg_alpha=0,
                              reg_lambda=1,
                              scale_pos_weight=1,
                              base_score=0.5,
                              random_state=0,
                              seed=None,
                              missing=None,
                              sample_weight=None,
                              eval_set=None,
                              eval_metric=None,
                              early_stopping_rounds=None,
                              verbose=True,
                              xgb_model=None,
                              sample_weight_eval_set=None):
    # Keyword arguments keep the call independent of xgboost's positional order.
    classifier = XGBClassifier(
        max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators,
        silent=silent, objective=objective, booster=booster, n_jobs=n_jobs,
        nthread=nthread, gamma=gamma, min_child_weight=min_child_weight,
        max_delta_step=max_delta_step, subsample=subsample,
        colsample_bytree=colsample_bytree, colsample_bylevel=colsample_bylevel,
        reg_alpha=reg_alpha, reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight,
        base_score=base_score, random_state=random_state, seed=seed, missing=missing)
    classifier.fit(
        table[feature_cols], table[label_col], sample_weight=sample_weight,
        eval_set=eval_set, eval_metric=eval_metric,
        early_stopping_rounds=early_stopping_rounds, verbose=verbose,
        xgb_model=xgb_model, sample_weight_eval_set=sample_weight_eval_set)

    # json
    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_
    #     plt.rcdefaults()
    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    #     plt.rcParams['figure.dpi'] = figure_dpi
    #     plot_tree(classifier)
    #     fig_plot_tree_UT = plt2MD(plt)
    #     plt.clf()
    #     plt.rcParams['figure.dpi'] = figure_dpi
    #     plot_tree(classifier, rankdir='LR')
    #     fig_plot_tree_LR = plt2MD(plt)
    #     plt.rcdefaults()
    #     plt.clf()

    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier

    # report
    #     get_param_list = []
    #     get_param_list.append(['feature_cols', feature_cols])
    #     get_param_list.append(['label_col', label_col])

    params = dict2MD(get_param)
    #     for key, value in get_param.items():
    #         temp = [key, value]
    #         get_param_list.append(temp)
    #     get_param_df = pd.DataFrame(data=get_param_list, columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Importance
    | {fig_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))
    model['report'] = rb.get()

    return {'model': model}
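
For reference, a minimal sketch of fitting an XGBClassifier directly and reading its feature importances; the toy columns 'x1', 'x2' and 'label' are made up for illustration.

# Minimal sketch of XGBClassifier training and feature-importance inspection.
import pandas as pd
from xgboost import XGBClassifier

toy = pd.DataFrame({'x1': [0.1, 0.4, 0.35, 0.8, 0.9, 0.2, 0.7, 0.6],
                    'x2': [1.0, 0.9, 0.2, 0.1, 0.3, 0.8, 0.4, 0.5],
                    'label': [0, 0, 0, 1, 1, 0, 1, 1]})
feature_cols, label_col = ['x1', 'x2'], 'label'

clf = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=50)
clf.fit(toy[feature_cols], toy[label_col])
print(dict(zip(feature_cols, clf.feature_importances_)))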