Exemple #1
0
def _profile_table(table, bins=10, check_correlation=False, correlation_threshold=0.9, correlation_overrides=None):
    """Profile ``table`` and return the profile's HTML wrapped in a report.

    Args:
        table: Data handed unchanged to ``pd_profiling.ProfileReport``.
        bins: Forwarded to ``ProfileReport`` as ``bins``.
        check_correlation: Forwarded as ``check_correlation``.
        correlation_threshold: Forwarded as ``correlation_threshold``.
        correlation_overrides: Forwarded as ``correlation_overrides``.

    Returns:
        ``{'result': {'report': ...}}`` where the report is whatever
        ``ReportBuilder.get()`` yields after the profile HTML was added.
    """
    builder = ReportBuilder()

    # Collect the profiling options once so the call below stays readable.
    profile_options = {
        'bins': bins,
        'check_correlation': check_correlation,
        'correlation_threshold': correlation_threshold,
        'correlation_overrides': correlation_overrides,
    }
    profile_report = pd_profiling.ProfileReport(table, **profile_options)
    builder.addHTML(profile_report.html)

    return {'result': {'report': builder.get()}}
Exemple #2
0
def tukeys_range_test(table, response_cols, factor_col, alpha=0.05):
    """Run ``pairwise_tukeyhsd`` per response column and collect a report.

    For every name in ``response_cols`` the column is compared across the
    groups in ``table[factor_col]``; the result table (as HTML) and the
    simultaneous-interval plot are appended to the report under a heading
    named after the column.

    Args:
        table: Object indexable by column name (``table[name]``).
        response_cols: Iterable of response column names.
        factor_col: Name of the grouping column.
        alpha: Passed to ``pairwise_tukeyhsd`` as ``alpha``.

    Returns:
        ``{'result': {'report': ...}}`` built from ``ReportBuilder.get()``.
    """
    report = ReportBuilder()
    report.addMD("## Tukey's range test Result")

    for col_name in response_cols:
        # Columns are looked up per iteration, so nothing is read from the
        # table when response_cols is empty.
        tukey = pairwise_tukeyhsd(table[col_name], table[factor_col], alpha=alpha)
        result_html = tukey._results_table.as_html()
        tukey.plot_simultaneous()

        report.addMD("### {response_col}".format(response_col=col_name))
        report.addHTML(result_html)
        report.addPlt(plt)
        # Clear the current figure before the next column is plotted.
        plt.clf()

    payload = {'report': report.get()}
    return {'result': payload}
Exemple #3
0
 def default(self, o):
     """Convert ``o`` into a dict of plain Python values.

     NOTE(review): presumably a ``json.JSONEncoder.default`` override;
     the enclosing class is not visible here, so confirm.

     * ``set`` -> ``{'__set__': [...]}``
     * ``numpy.ndarray`` -> ``{'__numpy__': _to_default_list(o)}``
     * anything else -> ``{'report': ..., '__pickled__': [...]}``, where
       the report comes from ``_repr_html_()`` if the object has it, else
       from the object itself if it has ``savefig``, else from ``str(o)``.
     """
     # TODO add more support types
     if isinstance(o, set):
         return {'__set__': list(o)}
     if isinstance(o, numpy.ndarray):
         return {'__numpy__': _to_default_list(o)}

     # Remaining objects all get a report plus their pickled bytes; only
     # the way the report is filled differs.
     builder = ReportBuilder()
     if hasattr(o, '_repr_html_'):
         builder.addHTML(o._repr_html_())
     elif hasattr(o, 'savefig'):
         builder.addPlt(o)
     else:
         builder.addRawTextMD(str(o))

     rendered = builder.get()
     return {'report': rendered, '__pickled__': list(pickle.dumps(o))}
Exemple #4
0
def _linear_regression_train(table, feature_cols, label_col, fit_intercept=True):
    """Fit a linear regression on ``table`` and build a diagnostics report.

    Two fits are made on the same data: a ``LinearRegression`` estimator
    (stored in the model and used for the predictions and residuals) and a
    statsmodels ``OLS`` fit (used for the summary tables and statistics).

    Args:
        table: Object indexable by column name / list of column names.
        feature_cols: Names of the feature columns.
        label_col: Name of the label column.
        fit_intercept: When truthy, both fits include an intercept (the OLS
            design matrix gets a constant column via ``sm.add_constant``).

    Returns:
        ``{'model': model}`` where ``model`` carries the features, label,
        OLS statistics, the fitted ``LinearRegression`` object, the three
        summary tables and the rendered report.
    """
    features = table[feature_cols]
    label = table[label_col]
    # Pass the flag by keyword: newer scikit-learn releases make estimator
    # parameters keyword-only, and the keyword form works on older ones too.
    lr_model = LinearRegression(fit_intercept=fit_intercept)
    lr_model.fit(features, label)

    predict = lr_model.predict(features)
    residual = label - predict

    # Use plain truthiness so OLS and LinearRegression always agree on
    # whether an intercept is fitted.
    if fit_intercept:
        lr_model_fit = sm.OLS(label, sm.add_constant(features)).fit()
    else:
        lr_model_fit = sm.OLS(label, features).fit()

    summary = lr_model_fit.summary()
    summary_tables = simple_tables2df_list(summary.tables)
    summary0 = summary_tables[0]
    summary1 = summary_tables[1]
    summary2 = summary_tables[2]

    html_result = summary.as_html()

    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)

    # Least-squares line through (predicted, actual), solved from the normal
    # equations. Every sum covers ALL samples; the previous hand-written
    # loop for sum(x*y) stopped one element short.
    x = predict
    y = np.array(label)
    n_samples = x.size
    sum_x = np.sum(x)
    sum_xx = np.sum(x * x)
    sum_y = np.sum(y)
    sum_xy = np.sum(x * y)
    det = n_samples * sum_xx - sum_x * sum_x
    intercept = (sum_xx * sum_y - sum_x * sum_xy) / det
    slope = (n_samples * sum_xy - sum_x * sum_y) / det
    p1x = np.min(x)
    p1y = intercept + slope * p1x
    p2x = np.max(x)
    p2y = intercept + slope * p2x
    plt.plot([p1x, p2x], [p1y, p2y], 'r--')
    fig_actual_predict = plt2MD(plt)

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)

    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3
               )))

    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['lr_model'] = lr_model
    model['report'] = rb.get()

    model['summary0'] = summary0
    model['summary1'] = summary1
    model['summary2'] = summary2

    return {'model': model}
Exemple #5
0
def _glm_train(table, feature_cols, label_col, family="Gaussian", link="ident", fit_intercept=True):
    features = table[feature_cols]
    label = table[label_col]

    if label_col in feature_cols:
        raise Exception("%s is duplicated." % label_col)

    if family == "Gaussian": 
        sm_family = sm.families.Gaussian()
    elif family == "inv_Gaussian":
        sm_family = sm.families.InverseGaussian()
    elif family == "binomial":
        sm_family = sm.families.Binomial()
    elif family == "Poisson":
        sm_family = sm.families.Poisson()
    elif family == "neg_binomial":
        sm_family = sm.families.NegativeBinomial()
    elif family == "gamma":
        sm_family = sm.families.Gamma()
    elif family == "Tweedie":
        sm_family = sm.families.Tweedie()

    if link == "ident":
        sm_link = sm.families.links.identity
    elif link == "log":
        sm_link = sm.families.links.log
    elif link == "logit":
        sm_link = sm.families.links.logit
    elif link == "probit":
        sm_link = sm.families.links.probit
    elif link == "cloglog":
        sm_link = sm.families.links.cLogLog
    elif link == "pow":
        sm_link = sm.families.links.Power
    elif link == "nbinom":
        sm_link = sm.families.links.binom

    if fit_intercept == True:
        glm_model = sm.GLM(label, sm.add_constant(features), family=sm_family, link=sm_link).fit()
    else:
        glm_model = sm.GLM(label, features, family=sm_family, link=sm_link).fit()
    summary = glm_model.summary().as_html()

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## GLM Result
    | ### Summary
    |
    """))
    rb.addHTML(summary)

    model = _model_dict('glm_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['family'] = family
    model['link'] = link
    model['coefficients'] = glm_model.params
    model['aic'] = glm_model.aic
    model['bic'] = glm_model.bic
    model['tvalues'] = glm_model.tvalues
    model['pvalues'] = glm_model.pvalues
    model['fit_intercept'] = fit_intercept
    model['glm_model'] = glm_model
    model['report'] = rb.get()

    return {'model' : model}
Exemple #6
0
                return new_dict
            else:
                return item

        return super(DefaultEncoder, self).encode(hint_tuples(obj))

    def default(self, o):
<<<<<<< HEAD
        # TODO add more support types
        if isinstance(o, set):
            return {'__set__': list(o)}
        elif isinstance(o, numpy.ndarray):
            return {'__numpy__': _to_default_list(o)}
        elif hasattr(o, '_repr_html_'):
            rb = ReportBuilder()
            rb.addHTML(o._repr_html_())
            return {'report': rb.get(), '__pickled__': list(pickle.dumps(o))}
        elif hasattr(o, 'savefig'):
            rb = ReportBuilder()
            rb.addPlt(o)
            return {'report': rb.get(), '__pickled__': list(pickle.dumps(o))}
        else:
            rb = ReportBuilder()
            rb.addRawTextMD(str(o))
            return {'report': rb.get(), '__pickled__': list(pickle.dumps(o))}
=======
        if isinstance(o, set):
            return {'__set__': list(o)}
        elif isinstance(o, numpy.ndarray):
            return {'__numpy__': o.tolist()}
        # TODO add more support types