def doctovec_similar_sentence(table, model, text_col, label_col):
    df = table.copy()
    result_sim = {}
    for i in range(10):
        temp = {}
        temp['sentence'] = []
        temp['label'] = []
        for id, vec in model.docvecs.most_similar(i):
            temp['sentence'].append(df.at[id, text_col])
            temp['label'].append(df.at[id, label_col])
        result_sim[i] = pd.DataFrame(temp)

    str_MD = '## Most similar sentences \n'
    for i in range(10):
        str_MD += '|' + df.at[i, 'document'] + '\n'
        str_MD += '|' + pandasDF2MD(result_sim[i]) + '\n'

    rb = ReportBuilder()
    rb.addMD(strip_margin(str_MD))

    _model = _model_dict('doc2vec')
    _model['report'] = rb.get()

    return {'model': _model}
def bartletts_test(table, response_cols, factor_col):
    groups = table[factor_col].unique()
    data_list = []
    stat_list = []
    p_list = []
    for response_col in response_cols:
        response = table[response_col]
        stat_bart, p_bart = bartlett(*[response[table[factor_col] == group] for group in groups])
        data = '{response_col} by {factor_col}'.format(response_col=response_col, factor_col=factor_col)
        data_list.append(data)
        stat_list.append(stat_bart)
        p_list.append(p_bart)

    result_table = pd.DataFrame.from_items([
        ['data', data_list],
        ['estimate', stat_list],
        ['p_value', p_list]
    ])

    result = dict()
    result['result_table'] = result_table

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
## Bartlett's Test Result
| - H0: k population variances are equal.
| - H1: at least two variances are different.
|
| {result_table}
""".format(result_table=pandasDF2MD(result_table))))

    result['report'] = rb.get()

    return {'result': result}
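# Illustrative usage sketch (an assumption, not part of the original module): how
# bartletts_test might be called on a small stacked table. The column names and
# toy data below are made up for the example; pd is assumed to be the module-level
# pandas import used throughout.
def _example_bartletts_test():
    demo = pd.DataFrame({
        'weight': [4.17, 5.58, 5.18, 6.11, 4.50, 4.61, 5.17, 4.53],
        'group': ['ctrl', 'ctrl', 'ctrl', 'ctrl', 'trt', 'trt', 'trt', 'trt']
    })
    res = bartletts_test(demo, response_cols=['weight'], factor_col='group')
    return res['result']['result_table']  # Bartlett statistic and p-value per response column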
def _oneway_anova(table, response_cols, factor_col):
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
## One-way Analysis of Variance Result
"""))
    groups = table[factor_col].unique()
    groups.sort()
    sum_len = np.sum([len(str(group)) for group in groups])

    result = dict()
    result['_grouped_data'] = dict()

    for response_col in response_cols:
        data = table[response_col]
        result['_grouped_data'][response_col] = dict()

        ax = sns.boxplot(x=factor_col, y=response_col, data=table, order=groups)
        if sum_len > 512:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        elif sum_len > 64:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
        fig_box = plt2MD(plt)
        plt.clf()

        model = ols("""Q('{response_col}') ~ C(Q('{factor_col}'))""".format(response_col=response_col,
                    factor_col=factor_col), table).fit()  # TODO factor_col = class => error
        anova = anova_lm(model)
        anova_df = pandasDF2MD(anova)
        p_value = anova["""PR(>F)"""][0]

        residual = model.resid

        sns.distplot(residual)
        distplot = plt2MD(plt)
        plt.clf()

        sm.qqplot(residual, line='s')
        qqplot = plt2MD(plt)
        plt.clf()

        rb.addMD(strip_margin("""
| ## {response_col} by {factor_col}
| {fig_box}
|
| ### ANOVA
| {anova_df}
|
| ### Diagnostics
| {distplot}
|
| {qqplot}
""".format(response_col=response_col, factor_col=factor_col, fig_box=fig_box,
           anova_df=anova_df, distplot=distplot, qqplot=qqplot)))

        result['_grouped_data'][response_col]['p_value'] = p_value

    result['report'] = rb.get()
    return {'result': result}
def _evaluate_regression(table, label_col, prediction_col):
    label = table[label_col]
    predict = table[prediction_col]

    # compute metrics
    evs = explained_variance_score(label, predict)
    mae = mean_absolute_error(label, predict)
    mse = mean_squared_error(label, predict)
    mdae = median_absolute_error(label, predict)
    r2 = r2_score(label, predict)

    # json
    summary = dict()
    summary['label_col'] = label_col
    summary['prediction_col'] = prediction_col
    summary['r2_score'] = r2
    summary['mean_squared_error'] = mse
    summary['mean_absolute_error'] = mae
    summary['median_absolute_error'] = mdae
    summary['explained_variance_score'] = evs

    # report
    all_dict_list = [{'r2_score': r2, 'mean_squared_error': mse, 'mean_absolute_error': mae,
                      'median_absolute_error': mdae, 'explained_variance_score': evs}]
    all_df = pd.DataFrame(all_dict_list)
    all_df = all_df[['r2_score', 'mean_squared_error', 'mean_absolute_error',
                     'median_absolute_error', 'explained_variance_score']]
    summary['metrics'] = all_df

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
| ## Evaluate Regression Result
| ### Metrics
| {table1}
|
|
""".format(table1=pandasDF2MD(all_df))))

    summary['report'] = rb.get()

    return {'result': summary}
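# Illustrative usage sketch (an assumption, not in the original source): _evaluate_regression
# only needs a table that already holds both the ground-truth column and a prediction column.
# Column names and values below are invented for the example.
def _example_evaluate_regression():
    demo = pd.DataFrame({'y_true': [3.0, -0.5, 2.0, 7.0],
                         'y_pred': [2.5, 0.0, 2.0, 8.0]})
    res = _evaluate_regression(demo, label_col='y_true', prediction_col='y_pred')
    return res['result']['metrics']  # one row: r2, MSE, MAE, median AE, explained variance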
def _hierarchical_clustering_post(table, model, num_clusters, cluster_col='prediction'):
    Z = model['model']
    input_cols = model['input_cols']
    params = model['parameters']
    out_table = model['outtable']
    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    out_table2 = table.copy()
    out_table2[cluster_col] = predict

    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])

    out_table3 = pd.DataFrame([])
    out_table3[cluster_col] = M
    out_table3['name_of_clusters'] = which_cluster
    out_table3 = out_table3.sort_values(cluster_col)
    cluster_count = np.bincount(out_table2[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    # data = {'cluster_name': ['prediction' + str(i) for i in range(1, num_clusters + 1)]}
    out_table3['num_of_entities'] = list(cluster_count)

    rb = ReportBuilder()
    rb.addMD(strip_margin("""### Hierarchical Clustering Post Process Result"""))
    rb.addMD(strip_margin("""
|### Parameters
|
|{display_params}
|
|## Clusters Information
|
|{out_table3}
|
""".format(display_params=dict2MD(params), out_table3=pandasDF2MD(out_table3))))

    model = _model_dict('hierarchical_clustering_post')
    model['report'] = rb.get()

    return {'out_table2': out_table2, 'model': model}
def tfidf_train(table, tokens_col, tf_weighing='n', df_weighing='t', document_normalization='c'):
    out_table = table.copy()
    _corpus = out_table[tokens_col]
    _smartirs = tf_weighing + df_weighing + document_normalization
    _dictionary = Dictionary(_corpus)
    _corpus = [_dictionary.doc2bow(text) for text in _corpus]
    _model = TfidfModel(_corpus, smartirs=_smartirs)
    _corpus = [text for text in _model[_corpus]]
    _sparse_matrix = corpus2csc(_corpus, num_terms=len(_dictionary.token2id)).T
    _values = [value for value in _dictionary.values()]
    _keys = [key for key in _dictionary.keys()]
    _dic = pd.DataFrame({'indice': _keys, 'word': _values})

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
| ## Dictionary
| {table1}
""".format(table1=pandasDF2MD(_dic))))

    out_table['sparse_vectors'] = sparse_encode(_sparse_matrix)['sparse_vectors']

    fit_model = dict()
    fit_model['dictionary'] = _dictionary
    fit_model['model'] = _model
    fit_model['report'] = rb.get()

    return {'out_table': out_table, 'fit_model': fit_model}
def _pca(table, input_cols, new_column_name='projected_', n_components=None, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None, hue=None, alpha=0, key_col=None): num_feature_cols = len(input_cols) if n_components is None: n_components = num_feature_cols pca = PCA(None, copy, whiten, svd_solver, tol, iterated_power, random_state) pca_model = pca.fit(table[input_cols]) column_names = [] for i in range(0, n_components): column_names.append(new_column_name + str(i)) # print(column_names) pca_result = pca_model.transform(table[input_cols]) out_df = pd.DataFrame(data=pca_result[:, :n_components], columns=[column_names]) out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1) out_df.columns = table.columns.values.tolist() + column_names res_components = pca_model.components_ res_components_df = pd.DataFrame(data=res_components[:n_components], columns=[input_cols]) res_explained_variance = pca_model.explained_variance_ res_explained_variance_ratio = pca_model.explained_variance_ratio_ res_singular_values = pca_model.singular_values_ res_mean = pca_model.mean_ res_n_components = pca_model.n_components_ res_noise_variance = pca_model.noise_variance_ res_get_param = pca_model.get_params() res_get_covariance = pca_model.get_covariance() res_get_precision = pca_model.get_precision() # visualization plt.figure() if n_components == 1: sns.scatterplot(column_names[0], column_names[0], hue=hue, data=out_df) plt_two = plt2MD(plt) plt.clf() else: plt_two = _biplot( 0, 1, pc_columns=column_names, columns=input_cols, singular_values=res_singular_values, components=res_components, explained_variance_ratio=res_explained_variance_ratio, alpha=alpha, hue=hue, data=out_df, ax=plt.gca(), key_col=key_col) plt.figure() fig_scree = _screeplot(res_explained_variance, res_explained_variance_ratio, n_components) table_explained_variance = pd.DataFrame(res_explained_variance, columns=['explained_variance']) table_explained_variance[ 'explained_variance_ratio'] = res_explained_variance_ratio table_explained_variance[ 'cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum( ) rb = ReportBuilder() rb.addMD( strip_margin(""" | ## PCA Result | ### Plot | {image1} | | ### Explained Variance | {fig_scree} | {table_explained_variance} | | ### Components | {table2} | | ### Parameters | {parameter1} """.format(image1=plt_two, fig_scree=fig_scree, table_explained_variance=pandasDF2MD(table_explained_variance), parameter1=dict2MD(res_get_param), table2=pandasDF2MD(res_components_df)))) model = _model_dict('pca') model['components'] = res_components model['explained_variance'] = res_explained_variance model['explained_variance_ratio'] = res_explained_variance_ratio model['singular_values'] = res_singular_values model['mean'] = res_mean model['n_components'] = res_n_components model['noise_variance'] = res_noise_variance model['parameters'] = res_get_param model['covariance'] = res_get_covariance model['precision'] = res_get_precision model['report'] = rb.get() model['pca_model'] = pca_model model['input_cols'] = input_cols return {'out_table': out_df, 'model': model}
def _logistic_regression_train(table, feature_cols, label_col, penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1): features = table[feature_cols] label = table[label_col] lr_model = LogisticRegression(penalty, dual, tol, C, fit_intercept, intercept_scaling, class_weight, random_state, solver, max_iter, multi_class, verbose, warm_start, n_jobs) lr_model.fit(features, label) featureNames = np.append("Intercept", feature_cols) intercept = lr_model.intercept_ coefficients = lr_model.coef_ classes = lr_model.classes_ is_binary = len(classes) == 2 if (fit_intercept == True): summary = pd.DataFrame({'features': ['intercept'] + feature_cols}) print(intercept) print(coefficients) coef_trans = np.concatenate(([intercept], np.transpose(coefficients)), axis=0) if not is_binary: summary = pd.concat( (summary, pd.DataFrame(coef_trans, columns=classes)), axis=1) elif is_binary: summary = pd.concat( (summary, pd.DataFrame(coef_trans, columns=[classes[0]])), axis=1) else: summary = pd.DataFrame({'features': feature_cols}) coef_trans = np.transpose(coefficients) if not is_binary: summary = pd.concat( (summary, pd.DataFrame(coef_trans, columns=classes)), axis=1) elif is_binary: summary = pd.concat( (summary, pd.DataFrame(coef_trans, columns=[classes[0]])), axis=1) prob = lr_model.predict_proba(features) rb = ReportBuilder() rb.addMD( strip_margin(""" | ## Logistic Regression Result | ### Summary | {table1} """.format(table1=pandasDF2MD(summary)))) model = dict() model['features'] = feature_cols model['label'] = label_col model['intercept'] = lr_model.intercept_ model['coefficients'] = lr_model.coef_ model['class'] = lr_model.classes_ model['penalty'] = penalty model['solver'] = solver model['lr_model'] = lr_model model['report'] = rb.get() return {'model': model}
def _pca(table, input_cols, new_column_name='projected_', n_components=None, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None): num_feature_cols = len(input_cols) if n_components is None: n_components = num_feature_cols pca = PCA(n_components, copy, whiten, svd_solver, tol, iterated_power, random_state) pca_model = pca.fit(table[input_cols]) column_names = [] for i in range(0, n_components): column_names.append(new_column_name + str(i)) # print(column_names) pca_result = pca_model.transform(table[input_cols]) out_df = pd.DataFrame(data=pca_result, columns=[column_names]) res_components = pca_model.components_ res_components_df = pd.DataFrame(data=res_components, columns=[input_cols]) res_explained_variance = pca_model.explained_variance_ res_explained_variance_ratio = pca_model.explained_variance_ratio_ res_singular_values = pca_model.singular_values_ res_mean = pca_model.mean_ res_n_components = pca_model.n_components_ res_noise_variance = pca_model.noise_variance_ res_get_param = pca_model.get_params() res_get_covariance = pca_model.get_covariance() res_get_precision = pca_model.get_precision() # visualization plt.figure() if res_n_components == 1: plt.scatter(pca_result[:, 0], pca_result[:, 0]) else: plt.scatter(pca_result[:, 0], pca_result[:, 1]) # plt.title('PCA result with two components') # plt.show() plt_two = plt2MD(plt) plt.clf() rb = ReportBuilder() rb.addMD( strip_margin(""" | | ### Plot | The x-axis and y-axis of the following plot is projected0 and projected1, respectively. | {image1} | | ### Result | {table1} | only showing top 20 rows | | ### Parameters | {parameter1} | | ### Components | {table2} | | ### Mean | {array1} | | ### Explained Variance | {array2} | """.format(table1=pandasDF2MD(out_df, 20), image1=plt_two, parameter1=dict2MD(res_get_param), table2=pandasDF2MD(res_components_df), array1=res_mean, array2=res_explained_variance))) model = _model_dict('pca') model['components'] = res_components model['explained_variance'] = res_explained_variance model['explained_variance_ratio'] = res_explained_variance_ratio model['singular_values'] = res_singular_values model['mean'] = res_mean model['n_components'] = res_n_components model['noise_variance'] = res_noise_variance model['parameters'] = res_get_param model['covariance'] = res_get_covariance model['precision'] = res_get_precision model['report'] = rb.get() model['pca_model'] = pca_model model['input_cols'] = input_cols out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1) out_df.columns = table.columns.values.tolist() + column_names return {'out_table': out_df, 'model': model}
def one_sample_ttest(table, input_cols, alternatives, hypothesized_mean=0, conf_level=0.95): n = len(table) degree = n - 1 alpha = 1.0 - conf_level out_table = pd.DataFrame() # statistics statistics = "t statistic, t distribution with %d degrees of freedom under the null hypothesis." % degree # Print model rb = ReportBuilder() rb.addMD( strip_margin(""" ## One Sample T Test Result | - Statistics = {s} | - Hypothesized mean = {h} | - Confidence level = {cl} """.format(s=statistics, h=hypothesized_mean, cl=conf_level))) for input_col in input_cols: # model alter_list = [] p_list = [] CI_list = [] # data data = input_col # estimates result = stats.ttest_1samp(table[input_col], hypothesized_mean) estimates = result[0] cols = [ 'data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'lower_confidence_interval', 'upper_confidence_interval' ] for i in alternatives: if (i == 'Greater'): # alternative hypothesis alternative_hypothesis = "true mean >" + str(hypothesized_mean) # p-values p_value = 1.0 - t.cdf(estimates, degree) # confidence interval - greater critical_val = t.ppf(1.0 - alpha, degree) width = critical_val * np.std( table[input_col]) / math.sqrt(n - 1) lower_conf_interval = np.mean(table[input_col]) - width upper_conf_interval = math.inf # model alter = 'true mean > {hypothesized_mean}'.format( hypothesized_mean=hypothesized_mean) alter_list.append(alter) p_list.append(p_value) conf_interval = '({lower_conf_interval}, {upper_conf_interval})'.format( lower_conf_interval=lower_conf_interval, upper_conf_interval=upper_conf_interval) CI_list.append(conf_interval) # out_table list = [] list.append([ data, alternative_hypothesis, statistics, estimates, p_value, conf_level, lower_conf_interval, upper_conf_interval ]) out_table = out_table.append(pd.DataFrame(list, columns=cols)) if (i == 'Less'): # alternative hypothesis alternative_hypothesis = "true mean <" + str(hypothesized_mean) p_value = t.cdf(estimates, degree) # confidence interval - less critical_val = t.ppf(1.0 - alpha, degree) width = critical_val * np.std( table[input_col]) / math.sqrt(n - 1) lower_conf_interval = -math.inf upper_conf_interval = np.mean(table[input_col]) + width # model alter = 'true mean < {hypothesized_mean}'.format( hypothesized_mean=hypothesized_mean) alter_list.append(alter) p_list.append(p_value) conf_interval = '({lower_conf_interval}, {upper_conf_interval})'.format( lower_conf_interval=lower_conf_interval, upper_conf_interval=upper_conf_interval) CI_list.append(conf_interval) # out_table list = [] list.append([ data, alternative_hypothesis, statistics, estimates, p_value, conf_level, lower_conf_interval, upper_conf_interval ]) out_table = out_table.append(pd.DataFrame(list, columns=cols)) if (i == 'Two Sided'): # alternative hypothesis alternative_hypothesis = "true mean !=" + str( hypothesized_mean) # p_value = (1.0 - t.cdf(abs(estimates), degree)) * 2.0 if (estimates >= 0): p_value = 2.0 * t.cdf(-estimates, degree) else: p_value = 2.0 * t.cdf(estimates, degree) # confidence interval - two-sided critical_val = t.ppf(1.0 - alpha / 2, degree) width = critical_val * np.std( table[input_col]) / math.sqrt(n - 1) lower_conf_interval = np.mean(table[input_col]) - width upper_conf_interval = np.mean(table[input_col]) + width # model alter = 'true mean != {hypothesized_mean}'.format( hypothesized_mean=hypothesized_mean) alter_list.append(alter) p_list.append(p_value) conf_interval = '({lower_conf_interval}, {upper_conf_interval})'.format( lower_conf_interval=lower_conf_interval, 
upper_conf_interval=upper_conf_interval) CI_list.append(conf_interval) # out_table list = [] list.append([ data, alternative_hypothesis, statistics, estimates, p_value, conf_level, lower_conf_interval, upper_conf_interval ]) out_table = out_table.append(pd.DataFrame(list, columns=cols)) # Print model conf_level_percent = conf_level * 100 result_table = pd.DataFrame.from_items( [['alternative hypothesis', alter_list], ['p-value', p_list], ['%g%% confidence Interval' % conf_level_percent, CI_list]]) result = dict() result['result_table'] = result_table rb.addMD( strip_margin(""" ### Data = {input_col} | - Estimates = {estimates} | | {result_table} """.format(input_col=input_col, estimates=estimates, result_table=pandasDF2MD(result_table)))) # print model result['report'] = rb.get() return {'out_table': out_table, 'model': result}
def two_sample_ttest_for_stacked_data(table, response_cols, factor_col, alternatives, first, second, hypo_diff=0, equal_vari='pooled', confi_level=0.95): if (type(table[factor_col][0]) == str): table_first = table[table[factor_col] == first] table_second = table[table[factor_col] == second] elif (type(table[factor_col][0]) == bool): table_first = table[table[factor_col] == bool(first)] table_second = table[table[factor_col] == bool(second)] else: table_first = table[table[factor_col] == float(first)] table_second = table[table[factor_col] == float(second)] tmp_table = [] rb = ReportBuilder() rb.addMD( strip_margin(""" ## Two Sample T Test for Stacked Data Result | - Hypothesized mean = {hypo_diff} | - Confidence level = {confi_level} """.format(hypo_diff=hypo_diff, confi_level=confi_level))) for response_col in response_cols: tmp_model = [] number1 = len(table_first[response_col]) number2 = len(table_second[response_col]) mean1 = (table_first[response_col]).mean() mean2 = (table_second[response_col]).mean() std1 = (table_first[response_col]).std() std2 = (table_second[response_col]).std() start_auto = 0 if (equal_vari == 'auto'): start_auto = 1 f_value = (std1**2) / (std2**2) f_test_p_value_tmp = stats.f.cdf(1 / f_value, number1 - 1, number2 - 1) if (f_test_p_value_tmp > 0.5): f_test_p_value = (1 - f_test_p_value_tmp) * 2 else: f_test_p_value = f_test_p_value_tmp * 2 if (f_test_p_value < 0.05): equal_vari = 'unequal' else: equal_vari = 'pooled' ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'larger', usevar=equal_vari, value=hypo_diff) if 'larger' in alternatives: ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'larger', usevar=equal_vari, value=hypo_diff) df = ttestresult[2] if (equal_vari == 'pooled'): std_number1number2 = sqrt( ((number1 - 1) * (std1)**2 + (number2 - 1) * (std2)**2) / (number1 + number2 - 2)) margin = t.ppf( (confi_level), df) * std_number1number2 * sqrt(1 / number1 + 1 / number2) if (equal_vari == 'unequal'): margin = t.ppf( (confi_level), df) * sqrt(std1**2 / (number1) + std2**2 / (number2)) tmp_model += [['true difference in means > 0.0'] + [ttestresult[1]] + [(mean1 - mean2 - margin, math.inf)]] tmp_table += [[ '%s by %s(%s,%s)' % (response_col, factor_col, first, second) ] + ['true difference in means > 0.0'] + [ 't statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2] ] + [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [mean1 - mean2 - margin] + [math.inf]] if 'smaller' in alternatives: ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'smaller', usevar=equal_vari, value=hypo_diff) df = ttestresult[2] if (equal_vari == 'pooled'): std_number1number2 = sqrt( ((number1 - 1) * (std1)**2 + (number2 - 1) * (std2)**2) / (number1 + number2 - 2)) margin = t.ppf( (confi_level), df) * std_number1number2 * sqrt(1 / number1 + 1 / number2) if (equal_vari == 'unequal'): margin = t.ppf( (confi_level), df) * sqrt(std1**2 / (number1) + std2**2 / (number2)) tmp_model += [['true difference in means < 0.0'] + [ttestresult[1]] + [(-math.inf, mean1 - mean2 + margin)]] tmp_table += [[ '%s by %s(%s,%s)' % (response_col, factor_col, first, second) ] + ['true difference in means < 0.0'] + [ 't statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2] ] + [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [-math.inf] + [mean1 - mean2 + margin]] if 'two-sided' in alternatives: ttestresult = 
ttest_ind(table_first[response_col], table_second[response_col], 'two-sided', usevar=equal_vari, value=hypo_diff) df = ttestresult[2] if (equal_vari == 'pooled'): std_number1number2 = sqrt( ((number1 - 1) * (std1)**2 + (number2 - 1) * (std2)**2) / (number1 + number2 - 2)) margin = t.ppf( (confi_level), df) * std_number1number2 * sqrt(1 / number1 + 1 / number2) if (equal_vari == 'unequal'): margin = t.ppf( (confi_level), df) * sqrt(std1**2 / (number1) + std2**2 / (number2)) tmp_model += [['true difference in means != 0.0'] + [ttestresult[1]] + [(mean1 - mean2 - margin, mean1 - mean2 + margin)]] tmp_table += [[ '%s by %s(%s,%s)' % (response_col, factor_col, first, second) ] + ['true difference in means != 0.0'] + [ 't statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2] ] + [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [mean1 - mean2 - margin] + [mean1 - mean2 + margin]] result_model = pd.DataFrame.from_records(tmp_model) result_model.columns = [ 'alternatives', 'p values', '%g%% confidence interval' % (confi_level * 100) ] rb.addMD( strip_margin(""" | #### Data = {response_col} by {factor_col}({first},{second}) | - Statistics = t statistic, t distribution with {ttestresult2} degrees of freedom under the null hypothesis | - Estimates= {ttestresult0} | | {result_model} | """.format(ttestresult2=ttestresult[2], response_col=response_col, factor_col=factor_col, first=first, second=second, ttestresult0=ttestresult[0], result_model=pandasDF2MD(result_model)))) if (start_auto == 1): equal_vari = 'auto' result = pd.DataFrame.from_records(tmp_table) result.columns = [ 'data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'lower_confidence_interval', 'upper_confidence_interval' ] model = dict() model['report'] = rb.get() return {'out_table': result, 'model': model}
def _correlation(table, vars, method='pearson', height=2.5, corr_prec=2):
    size = len(vars)
    s_default = plt.rcParams['lines.markersize'] ** 2.
    scatter_kws = {"s": s_default * height / 6.4}

    result_arr = []
    for i in range(size):
        for j in range(i):
            if method == 'pearson':
                r, p = stats.pearsonr(table[vars[i]], table[vars[j]])
            elif method == 'spearman':
                r, p = stats.spearmanr(table[vars[i]], table[vars[j]])
            elif method == 'kendal':
                r, p = stats.kendalltau(table[vars[i]], table[vars[j]])
            result_arr.append([vars[i], vars[j], r, p])

    df_result = pd.DataFrame(result_arr, columns=['x', 'y', 'corr', 'p_value'])

    def corr(x, y, **kwargs):
        if kwargs['method'] == 'pearson':
            r, p = stats.pearsonr(x, y)
        elif kwargs['method'] == 'spearman':
            r, p = stats.spearmanr(x, y)
        elif kwargs['method'] == 'kendal':
            r, p = stats.kendalltau(x, y)

        p_stars = ''
        if p <= 0.05:
            p_stars = '*'
        if p <= 0.01:
            p_stars = '**'
        if p <= 0.001:
            p_stars = '***'

        corr_text = '{:.{prec}f}'.format(r, prec=corr_prec)
        font_size = abs(r) * 15 * 2 / corr_prec + 5
        ax = plt.gca()
        ax.annotate(corr_text, [.5, .5], xycoords="axes fraction",
                    ha='center', va='center', fontsize=font_size * height)
        ax.annotate(p_stars, xy=(0.65, 0.6), xycoords=ax.transAxes,
                    color='red', fontsize=17 * height)

    g = sns.PairGrid(table, vars=vars, height=height)
    g.map_diag(sns.distplot)
    if method == 'pearson':
        g.map_lower(sns.regplot, scatter_kws=scatter_kws)
    else:
        g.map_lower(sns.regplot, lowess=True, scatter_kws=scatter_kws)
    g.map_upper(corr, method=method)
    fig_corr = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
## Correlation Results
| ### Correlation Matrix
| {fig_corr}
|
| ### Correlation Table
| {table}
""".format(fig_corr=fig_corr, table=pandasDF2MD(df_result))))

    params = {'vars': vars, 'method': method, 'height': height}

    res = dict()
    res['params'] = params
    res['corr_table'] = df_result
    res['report'] = rb.get()

    return {'result': res}
def _xgb_regression_train(table, feature_cols, label_col, max_depth=3, learning_rate=0.1, n_estimators=100,
                          silent=True, objective='reg:linear', booster='gbtree', n_jobs=1, nthread=None,
                          gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
                          colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5,
                          random_state=0, seed=None, missing=None, sample_weight=None, eval_set=None,
                          eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None,
                          sample_weight_eval_set=None):
    regressor = XGBRegressor(max_depth, learning_rate, n_estimators, silent, objective, booster,
                             n_jobs, nthread, gamma, min_child_weight, max_delta_step, subsample,
                             colsample_bytree, colsample_bylevel, reg_alpha, reg_lambda,
                             scale_pos_weight, base_score, random_state, seed, missing)
    regressor.fit(table[feature_cols], table[label_col], sample_weight, eval_set, eval_metric,
                  early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set)

    # json
    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_
    # plt.rcdefaults()
    plot_importance(regressor)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(regressor)
    # fig_plot_tree_UT = plt2MD(plt)
    # plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(regressor, rankdir='LR')
    # fig_plot_tree_LR = plt2MD(plt)
    # plt.rcdefaults()
    # plt.clf()

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance
    # out_model['plot_tree_UT'] = fig_plot_tree_UT
    # out_model['plot_tree_LR'] = fig_plot_tree_LR
    # out_model['to_graphviz'] = md_to_graphviz

    # report
    get_param_list = []
    get_param_list.append(['feature_cols', feature_cols])
    get_param_list.append(['label_col', label_col])
    for key, value in get_param.items():
        temp = [key, value]
        get_param_list.append(temp)
    get_param_df = pd.DataFrame(data=get_param_list, columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_cols).T

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
| ## XGB Regression Result
|
| ### Plot Importance
| {image_importance}
|
| ### Feature Importance
| {table_feature_importance}
|
| ### Parameters
| {table_parameter}
|
""".format(image_importance=fig_plot_importance,
           table_feature_importance=pandasDF2MD(feature_importance_df, 20),
           table_parameter=pandasDF2MD(get_param_df))))

    out_model['report'] = rb.get()

    return {'model': out_model}
def _evaluate_classification(table, label_col, prediction_col):
    label = table[label_col]
    predict = table[prediction_col]

    # compute metrics
    accuracy = accuracy_score(label, predict)
    f1 = f1_score(label, predict, average="weighted")
    precision = precision_score(label, predict, average="weighted")
    recall = recall_score(label, predict, average="weighted")
    class_names = np.unique(np.union1d(label.values, predict.values))

    # Plot non-normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label, predict, classes=class_names,
                           title='Confusion matrix, without normalization')
    fig_cnf_matrix = plt2MD(plt)

    # Plot normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label, predict, classes=class_names, normalize=True,
                           title='Normalized confusion matrix')
    fig_cnf_matrix_normalized = plt2MD(plt)
    plt.clf()

    # json
    summary = dict()
    summary['label_col'] = label_col
    summary['prediction_col'] = prediction_col
    summary['f1_score'] = f1
    summary['accuracy_score'] = accuracy
    summary['precision_score'] = precision
    summary['recall_score'] = recall

    # report
    all_dict_list = [{'f1': f1, 'accuracy': accuracy, 'precision': precision, 'recall': recall}]
    all_df = pd.DataFrame(all_dict_list)
    all_df = all_df[['f1', 'accuracy', 'precision', 'recall']]
    summary['metrics'] = all_df

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
| ## Evaluate Classification Result
| ### Metrics
| {table1}
|
| ### Confusion matrix
| {fig_confusion_matrix}
|
| {fig_confusion_matrix_normalized}
|
""".format(table1=pandasDF2MD(all_df),
           fig_confusion_matrix=fig_cnf_matrix,
           fig_confusion_matrix_normalized=fig_cnf_matrix_normalized)))

    summary['report'] = rb.get()

    return {'result': summary}
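# Illustrative usage sketch (an assumption, not in the original source): the classification
# evaluator likewise reads a label column and a prediction column from the same table; it
# relies on the module's _plot_confusion_matrix helper being available.
def _example_evaluate_classification():
    demo = pd.DataFrame({'label': ['a', 'a', 'b', 'b', 'b'],
                         'prediction': ['a', 'b', 'b', 'b', 'a']})
    res = _evaluate_classification(demo, label_col='label', prediction_col='prediction')
    return res['result']['metrics']  # weighted f1, accuracy, precision, recall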
def paired_ttest(table, first_column, second_column, alternative, hypothesized_difference=0, confidence_level=0.95): df = len(table) - 1 diff_mean = (table[first_column] - table[second_column]).mean() std_dev = np.std(table[first_column] - table[second_column]) t_value = stats.ttest_rel(table[first_column], table[second_column] + hypothesized_difference)[0] p_value_ul = stats.ttest_rel(table[first_column], table[second_column] + hypothesized_difference)[1] p_value_u = stats.t.sf(t_value, df) p_value_l = stats.t.cdf(t_value, df) left_u = diff_mean - std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df) right_l = diff_mean + std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df) left_ul = diff_mean - std_dev * stats.t.isf((1 - confidence_level) / 2, df) / np.sqrt(df) right_ul = diff_mean + std_dev * stats.t.isf((1 - confidence_level) / 2, df) / np.sqrt(df) result_value_u = [{'data' : first_column + " , " + second_column, 'alternative_hypothesis' : "true difference in means > " + str(hypothesized_difference), 'statistics' : "t statistics, t distribution with " + str(df) + " degrees of freedom under the null hypothesis", 'estimates' : t_value, 'p_value' : p_value_u, 'confidence_level' : confidence_level, 'low_confidence_interval' : left_u, 'upper_confidence_interval' : np.Infinity}] result_value_l = [{'data' : first_column + " , " + second_column, 'alternative_hypothesis' : "true difference in means < " + str(hypothesized_difference), 'statistics' : "t statistics, t distribution with " + str(df) + " degrees of freedom under the null hypothesis", 'estimates' : t_value, 'p_value' : p_value_l, 'confidence_level' : confidence_level, 'low_confidence_interval' :-np.Infinity, 'upper_confidence_interval' : right_l}] result_value_ul = [{'data' : first_column + " , " + second_column, 'alternative_hypothesis' : "true difference in means != " + str(hypothesized_difference), 'statistics' : "t statistics, t distribution with " + str(df) + " degrees of freedom under the null hypothesis", 'estimates' : t_value, 'p_value' : p_value_ul, 'confidence_level' : confidence_level, 'low_confidence_interval' : left_ul, 'upper_confidence_interval' : right_ul}] df_result = pd.DataFrame() df_u = pd.DataFrame(result_value_u, columns=['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'low_confidence_interval', 'upper_confidence_interval']) df_l = pd.DataFrame(result_value_l, columns=['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'low_confidence_interval', 'upper_confidence_interval']) df_ul = pd.DataFrame(result_value_ul, columns=['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'low_confidence_interval', 'upper_confidence_interval']) if 'greater' in alternative: df_result = df_result.append(df_u, ignore_index=True) if 'less' in alternative: df_result = df_result.append(df_l, ignore_index=True) if 'twosided' in alternative: df_result = df_result.append(df_ul, ignore_index=True) result_table_ul = pd.DataFrame([{'Alternative': 'Two Sided', 'H1': 'true difference in means != ' + str(hypothesized_difference), 't_value': t_value, 'p_value': p_value_ul, str(confidence_level * 100) + '% confidence interval': '(' + str(left_ul) + ', ' + str(right_ul) + ')'}]) result_table_u = pd.DataFrame([{'Alternative': 'Greater', 'H1': 'true difference in means > ' + str(hypothesized_difference), 't_value': t_value, 'p_value': p_value_u, str(confidence_level * 100) + '% confidence interval': '(' + 
str(left_u) + ', ' + str(np.Infinity) + ')'}]) result_table_l = pd.DataFrame([{'Alternative': 'Less', 'H1': 'true difference in means < ' + str(hypothesized_difference), 't_value': t_value, 'p_value': p_value_l, str(confidence_level * 100) + '% confidence interval': '(' + str(-np.Infinity) + ', ' + str(right_l) + ')'}]) result_table = pd.DataFrame() if 'greater' in alternative: result_table = result_table.append(result_table_u, ignore_index=True) if 'less' in alternative: result_table = result_table.append(result_table_l, ignore_index=True) if 'twosided' in alternative: result_table = result_table.append(result_table_ul, ignore_index=True) ordered_result_table = pd.DataFrame(result_table, columns=['Alternative', 'H1', 't_value', 'p_value', str(confidence_level * 100) + '% confidence interval']) rb = ReportBuilder() rb.addMD(strip_margin(""" |## Paired T Test Result |##### df : {deg_f} |##### Mean of differences : {dm} |##### Standard deviation : {sd} | |{result_table} | """.format(deg_f=df, dm=diff_mean, sd=std_dev, result_table=pandasDF2MD(ordered_result_table)))) model = dict() model['report'] = rb.get() return{'out_table':df_result, 'model':model}
def _hierarchical_clustering(table, input_cols, link='complete', met='euclidean', p=2, num_rows=20, figure_height=6.4, orient='right'): table = table.copy() df = table[input_cols] Z = linkage(df, method=link, metric=met) out_table = pd.DataFrame([]) out_table['linkage_step'] = [x + 1 for x in reversed(range(len(Z)))] out_table['joined_column1'] = ['pt_' + str(int(Z[:, 0][i])) for i in range(len(Z))] out_table['joined_column2'] = ['pt_' + str(int(Z[:, 1][i])) for i in range(len(Z))] out_table['name_of_clusters'] = ['CL_' + str(i + 1) for i in reversed(range(len(Z)))] out_table['distance'] = [distance for distance in Z[:, 2]] out_table['number_of_original'] = [int(entities) for entities in Z[:, 3]] # switch name of point to cluster name for i in range(len(Z)): if Z[:, 0][i] >= len(df) : out_table['joined_column1'][i] = out_table['name_of_clusters'][Z[:, 0][i] - len(df)] if Z[:, 1][i] >= len(df) : out_table['joined_column2'][i] = out_table['name_of_clusters'][Z[:, 1][i] - len(df)] out_table = out_table.reindex(index=out_table.index[::-1])[0:] out_table1 = out_table.head(num_rows) # calculate full dendrogram def _llf(id): n = len(df) if id < n: return 'pt_' + str(id) plt.figure(figsize=(8.4, figure_height)) _fancy_dendrogram( Z, truncate_mode='none', # show only the last p merged clusters (if another) get_leaves=True, orientation=orient, labels=True, leaf_label_func=_llf, leaf_rotation=45, leaf_font_size=5., show_contracted=False, # to get a distribution impression in truncated branches annotate_above=float(10), # useful in small plots so annotations don't overlap # max_d=distance_threshold, # will plot a horizontal cut-off line, max_d as in max_distance ) plt.title('Hierarchical Clustering Dendrogram') if orient=='top': plt.xlabel('Samples') plt.ylabel('Distance') elif orient=='right': plt.xlabel('Distance') plt.ylabel('Samples') plt2 = plt2MD(plt) plt.clf() rb = ReportBuilder() params = { 'Input Columns': input_cols, 'Linkage Method': link, 'Metric': met, 'Number of Rows in Linkage Matrix': num_rows } rb.addMD(strip_margin("""### Hierarchical Clustering Result""")) rb.addMD(strip_margin(""" |## Dendrogram | |{image} | |### Parameters | | {display_params} | |## Linkage Matrix | |{out_table1} | """.format(image=plt2, display_params=dict2MD(params), out_table1=pandasDF2MD(out_table1)))) model = _model_dict('hierarchical_clustering') model['model'] = Z model['input_cols'] = input_cols model['parameters'] = params model['outtable'] = out_table model['report'] = rb.get() return { 'model':model}
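# Illustrative usage sketch (an assumption, not in the original source): the linkage model
# produced by _hierarchical_clustering is meant to be passed straight into
# _hierarchical_clustering_post, which cuts the dendrogram into a fixed number of clusters.
# The toy coordinates below are invented for the example.
def _example_hierarchical_clustering_pipeline():
    demo = pd.DataFrame({'x': [0.0, 0.1, 0.2, 5.0, 5.1, 5.2],
                         'y': [0.0, 0.2, 0.1, 5.0, 5.2, 5.1]})
    fitted = _hierarchical_clustering(demo, input_cols=['x', 'y'], link='complete', met='euclidean')
    post = _hierarchical_clustering_post(demo, fitted['model'], num_clusters=2)
    return post['out_table2']  # original rows plus a 'prediction' cluster-id column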
def _naive_bayes_train(table, feature_cols, label_col, alpha=1.0, fit_prior=True, class_prior=None):
    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        tmp_class_prior = [0 for x in range(len(class_prior))]
        for elems in class_prior:
            tmp = elems.split(":")
            tmp_class_prior[label_encoder.transform([tmp[0]])[0]] = float(tmp[1])
        class_prior = tmp_class_prior

    nb_model = MultinomialNB(alpha, fit_prior, class_prior)
    nb_model.fit(features, label_correspond)
    class_log_prior = nb_model.class_log_prior_
    feature_log_prob_ = nb_model.feature_log_prob_
    tmp_result = np.hstack((list(map(list, zip(*[label_encoder.classes_] + [class_log_prior]))),
                            (feature_log_prob_)))
    column_names = ['labels', 'pi']
    for feature_col in feature_cols:
        column_names += ['theta_' + feature_col]
    result_table = pd.DataFrame.from_records(tmp_result, columns=column_names)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    # get_param['Prior Probabilities of the Classes'] = class_prior
    get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    _plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_, title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
| ## Naive Bayes Classification Result
|
| ### Model:Multinomial
| {result_table}
| ### Parameters
| {table_parameter}
| ### Predicted vs Actual
| {image1}
| #### Accuracy = {accuracy}%
|
""".format(image1=fig_confusion_matrix, accuracy=accuracy,
           result_table=pandasDF2MD(result_table), table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['report'] = rb.get()

    return {'model': model}
def _chi_square_test_of_independence(table, response_cols, factor_col, correction=False): label_list = [] feature_list = [] alternative_hypothesis_list = [] dof_list = [] stat_chi_list = [] p_chi_list = [] for response_col in response_cols: response = table[response_col] contingency_table = pd.crosstab(table[response_col], table[factor_col], margins=True) response_index = len(contingency_table) - 1 factor_index = len(contingency_table.columns) - 1 temporary = contingency_table.iloc[0:response_index, 0:factor_index] f_object = np.array(temporary) test = stats.chi2_contingency(f_object, correction, 1)[0:3] label = '{factor_col}'.format(factor_col=factor_col) feature = '{response_col}'.format(response_col=response_col) if test[1] < 0.05: dependence = 'Reject the null hypothesis that two categorical variables are independent at 5% significance level.' elif test[1] >= 0.05: dependence = 'No association was found between two categorical variables at 5% significance level.' elif math.isnan(test[1]): dependence = 'Independence of two categorical variables cannot be decided.' conclusion = '{dependence}'.format(dependence=dependence) alternative_hypothesis = 'Two categorical variables are dependent.' dof = 'chi-square distribution with {dof} degrees of freedom'.format( dof=test[2]) stat_chi = '{stat_chi}'.format(stat_chi=test[0]) p_chi = '{p_chi}'.format(p_chi=test[1]) label_list.append(label) feature_list.append(feature) alternative_hypothesis_list.append(alternative_hypothesis) dof_list.append(dof) stat_chi_list.append(stat_chi) p_chi_list.append(p_chi) result_table = pd.DataFrame.from_items( [['label', label_list], ['feature', feature_list], ['alternative_hypothesis', alternative_hypothesis_list], ['df', dof_list], ['estimate', stat_chi_list], ['p_value', p_chi_list]]) result = dict() result['result_table'] = result_table rb = ReportBuilder() rb.addMD( strip_margin(""" | ## Chi-square Test of Independence Result | - H0: the two categorical variables are independent. | - H1: the two categorical variables are dependent. """)) for response_col in response_cols: response = table[response_col] contingency_table = pd.crosstab(table[response_col], table[factor_col], margins=True) response_index = len(contingency_table) - 1 factor_index = len(contingency_table.columns) - 1 temporary = contingency_table.iloc[0:response_index, 0:factor_index] f_object = np.array(temporary) test = stats.chi2_contingency(f_object, correction, 1)[0:3] label = '{factor_col}'.format(factor_col=factor_col) feature = '{response_col}'.format(response_col=response_col) if test[1] < 0.05: dependence = 'Reject the null hypothesis that two categorical variables are independent at 5% significance level.' elif test[1] >= 0.05: dependence = 'No association was found between two categorical variables at 5% significance level.' elif math.isnan(test[1]): dependence = 'Independence of two categorical variables cannot be decided.' 
dof_simplelist = [] stat_chi_simplelist = [] p_chi_simplelist = [] dof = '{dof}'.format(dof=test[2]) stat_chi = '{stat_chi}'.format(stat_chi=test[0]) p_chi = '{p_chi}'.format(p_chi=test[1]) stat_chi_simplelist.append(stat_chi) dof_simplelist.append(dof) p_chi_simplelist.append(p_chi) result_table_simple = pd.DataFrame.from_items( [['estimate', stat_chi_simplelist], ['df', dof_simplelist], ['p_value', p_chi_simplelist]]) # test statistic = {test_statistic}, df = {dof}, p_value = {p_value} # test_statistic = stats.chi2_contingency(f_object,correction,lambda_)[0], dof=stats.chi2_contingency(f_object,correction,lambda_)[2], p_value=stats.chi2_contingency(f_object,correction,lambda_)[1] rb.addMD( strip_margin(""" |### Label: {label}, Feature: {feature} | |{result_table_simple} | |{dependence} | | """.format(label=factor_col, feature=response_col, result_table_simple=pandasDF2MD(result_table_simple), dependence=dependence))) model = _model_dict('Chi-square test of independence') model['report'] = rb.get() result_table = result_table.copy() return {'model': model}
def ftest_for_stacked_data(table, response_cols, factor_col, alternatives, first, second, confi_level=0.95): if (type(table[factor_col][0]) == str): table_first = table[table[factor_col] == first] table_second = table[table[factor_col] == second] elif (type(table[factor_col][0]) == bool): table_first = table[table[factor_col] == bool(first)] table_second = table[table[factor_col] == bool(second)] else: table_first = table[table[factor_col] == float(first)] table_second = table[table[factor_col] == float(second)] tmp_table = [] number1 = len(table_first[factor_col]) number2 = len(table_second[factor_col]) d_num = number1 - 1 d_denum = number2 - 1 rb = ReportBuilder() rb.addMD( strip_margin(""" ## F Test for Stacked Data Result | - Confidence level = {confi_level} | - Statistics = F statistic, F distribution with {d_num} numerator degrees of freedom and {d_denum} degrees of freedom under the null hypothesis """.format(confi_level=confi_level, d_num=d_num, d_denum=d_denum))) for response_col in response_cols: tmp_model = [] std1 = (table_first[response_col]).std() std2 = (table_second[response_col]).std() f_value = (std1**2) / (std2**2) if 'larger' in alternatives: p_value = scipy.stats.f.cdf(1 / f_value, d_num, d_denum) tmp_model += [ ['true ratio > 1'] + [p_value] + [(f_value / (scipy.stats.f.ppf(confi_level, d_num, d_denum)), math.inf)] ] tmp_table += [[ '%s by %s(%s,%s)' % (response_col, factor_col, first, second) ] + ['true ratio of variances > 1'] + [ 'F statistic, F distribution with %d numerator degrees of freedom and %d degrees of freedom under the null hypothesis.' % (d_num, d_denum) ] + [f_value] + [p_value] + [confi_level] + [ f_value / (scipy.stats.f.ppf(confi_level, d_num, d_denum)) ] + [math.inf]] if 'smaller' in alternatives: p_value = scipy.stats.f.cdf(f_value, d_num, d_denum) tmp_model += [['true ratio < 1'] + [p_value] + [(0, f_value * (scipy.stats.f.ppf(confi_level, d_denum, d_num)))]] tmp_table += [[ '%s by %s(%s,%s)' % (response_col, factor_col, first, second) ] + ['true ratio of variances < 1'] + [ 'F statistic, F distribution with %d numerator degrees of freedom and %d degrees of freedom under the null hypothesis.' % (d_num, d_denum) ] + [f_value] + [p_value] + [confi_level] + [0] + [ f_value * (scipy.stats.f.ppf(confi_level, d_denum, d_num)) ]] if 'two-sided' in alternatives: p_value_tmp = scipy.stats.f.cdf(1 / f_value, d_num, d_denum) if (p_value_tmp > 0.5): p_value = (1 - p_value_tmp) * 2 else: p_value = p_value_tmp * 2 tmp_model += [ ['true ratio != 1'] + [p_value] + [(f_value / (scipy.stats.f.ppf( (1 + confi_level) / 2, d_num, d_denum)), f_value * (scipy.stats.f.ppf((1 + confi_level) / 2, d_denum, d_num)))] ] tmp_table += [[ '%s by %s(%s,%s)' % (response_col, factor_col, first, second) ] + ['true ratio of variances != 1'] + [ 'F statistic, F distribution with %d numerator degrees of freedom and %d degrees of freedom under the null hypothesis.' 
% (d_num, d_denum) ] + [f_value] + [p_value] + [confi_level] + [ f_value / (scipy.stats.f.ppf( (1 + confi_level) / 2, d_num, d_denum)) ] + [ f_value * (scipy.stats.f.ppf( (1 + confi_level) / 2, d_denum, d_num)) ]] result_model = pd.DataFrame.from_records(tmp_model) result_model.columns = [ 'alternatives', 'p values', '%g%% confidence interval' % (confi_level * 100) ] rb.addMD( strip_margin(""" | #### Data = {response_col} by {factor_col}({first},{second}) | - Estimates= {f_value} | | {result_model} | """.format(response_col=response_col, factor_col=factor_col, first=first, second=second, f_value=f_value, result_model=pandasDF2MD(result_model)))) result = pd.DataFrame.from_records(tmp_table) result.columns = [ 'data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'lower_confidence_interval', 'upper_confidence_interval' ] model = dict() model['report'] = rb.get() return {'out_table': result, 'model': model}
def _paired_ttest(table, first_column, second_column, alternative=['greater', 'less', 'twosided'],
                  hypothesized_difference=0, confidence_level=0.95):
    df = len(table) - 1
    first_col = table[first_column]
    second_col = table[second_column]
    diff_mean = (first_col - second_col).mean()
    std_dev = np.std(first_col - second_col)
    t_value = stats.ttest_rel(table[first_column], table[second_column] + hypothesized_difference)[0]

    result = []
    alternative_hypothesis = []
    p_value = []
    confidence_interval = []

    if 'greater' in alternative:
        alternative_hypothesis.append('true difference in means > ' + str(hypothesized_difference))
        p_value.append(stats.t.sf(t_value, df))
        confidence_interval.append((diff_mean - std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df),
                                    np.Infinity))

    if 'less' in alternative:
        alternative_hypothesis.append('true difference in means < ' + str(hypothesized_difference))
        p_value.append(stats.t.cdf(t_value, df))
        confidence_interval.append((-np.Infinity,
                                    diff_mean + std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df)))

    if 'twosided' in alternative:
        alternative_hypothesis.append('true difference in means != ' + str(hypothesized_difference))
        p_value.append(stats.ttest_rel(first_col, second_col + hypothesized_difference)[1])
        other_term = std_dev * stats.t.isf((1 - confidence_level) / 2, df) / np.sqrt(df)
        confidence_interval.append((diff_mean - other_term, diff_mean + other_term))

    result.append(['alternative hypothesis', alternative_hypothesis])
    result.append(['t-value', t_value])
    result.append(['p-value', p_value])
    result.append(['%g%% confidence Interval' % (confidence_level * 100), confidence_interval])
    result_table = pd.DataFrame.from_items(result)

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
|## Paired T Test Result
|##### df : {deg_f}
|##### Mean of differences : {dm}
|##### Standard deviation : {sd}
|
|{result_table}
|
""".format(deg_f=df, dm=diff_mean, sd=std_dev, result_table=pandasDF2MD(result_table))))

    model = dict()
    model['report'] = rb.get()

    return {'model': model}
def _one_sample_ttest(table, input_cols, alternatives, hypothesized_mean=0, conf_level=0.95):
    cols = ['data', 'alternative_hypothesis', 'statistics', 't_value', 'p_value',
            'confidence_level', 'lower_confidence_interval', 'upper_confidence_interval']
    out_table = pd.DataFrame(columns=cols)
    n = len(table)
    alpha = 1.0 - conf_level
    statistics = "t statistic, t distribution with %d degrees of freedom under the null hypothesis." % (n - 1)

    # ## Build model
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
## One Sample T Test Result
| - Statistics = {s}
| - Hypothesized mean = {h}
| - Confidence level = {cl}
""".format(s=statistics, h=hypothesized_mean, cl=conf_level)))

    for input_col in input_cols:
        col = table[input_col]
        H1_list = []
        p_list = []
        CI_list = []

        # width of the confidence interval
        width_one_sided = _width(col, alpha, n)
        width_two_sided = _width(col, alpha / 2, n)

        # t-statistic, two-tailed p-value
        t_value, p_value_two = stats.ttest_1samp(col, hypothesized_mean)

        # one-tailed p-value for Greater
        if t_value >= 0:
            p_value_one = p_value_two / 2
        else:
            p_value_one = 1.0 - p_value_two / 2

        for alter in alternatives:
            if alter == 'Greater':
                H1 = 'true mean > {hypothesized_mean}'.format(hypothesized_mean=hypothesized_mean)
                p_value = p_value_one
                lower_conf_interval = np.mean(col) - width_one_sided
                upper_conf_interval = np.inf
            if alter == 'Less':
                H1 = 'true mean < {hypothesized_mean}'.format(hypothesized_mean=hypothesized_mean)
                p_value = 1.0 - p_value_one
                lower_conf_interval = -np.inf
                upper_conf_interval = np.mean(col) + width_one_sided
            if alter == 'Two Sided':
                H1 = 'true mean != {hypothesized_mean}'.format(hypothesized_mean=hypothesized_mean)
                p_value = p_value_two
                lower_conf_interval = np.mean(col) - width_two_sided
                upper_conf_interval = np.mean(col) + width_two_sided

            # ## Build out_table
            out = pd.Series([input_col, H1, statistics, t_value, p_value, conf_level,
                             lower_conf_interval, upper_conf_interval], index=cols)
            out_table = out_table.append(out, ignore_index=True)

            # ## Build model
            H1_list.append(H1)
            p_list.append(p_value)
            CI_list.append('({lower_conf_interval}, {upper_conf_interval})'.format(
                lower_conf_interval=lower_conf_interval, upper_conf_interval=upper_conf_interval))

        # ## Build model
        result_table = pd.DataFrame.from_items(
            [['alternative hypothesis', H1_list],
             ['p-value', p_list],
             ['%g%% confidence Interval' % (conf_level * 100), CI_list]])
        rb.addMD(strip_margin("""
### Data = {input_col}
| - t-value = {t_value}
|
| {result_table}
""".format(input_col=input_col, t_value=t_value, result_table=pandasDF2MD(result_table))))

    model = dict()
    model['report'] = rb.get()

    return {'out_table': out_table, 'model': model}
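# Illustrative usage sketch (an assumption, not in the original source): alternatives are
# selected by the literal strings 'Greater', 'Less' and 'Two Sided' used above; the module's
# _width helper is assumed to be available. The data and column name are invented.
def _example_one_sample_ttest():
    demo = pd.DataFrame({'height': [172.0, 168.5, 181.2, 175.4, 169.9, 177.3]})
    res = _one_sample_ttest(demo, input_cols=['height'],
                            alternatives=['Two Sided', 'Greater'],
                            hypothesized_mean=170, conf_level=0.95)
    return res['out_table']  # one row per (column, alternative) pair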
def _xgb_classification_train(table, feature_cols, label_col, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='binary:logistic', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None, sample_weight_eval_set=None): classifier = XGBClassifier(max_depth, learning_rate, n_estimators, silent, objective, booster, n_jobs, nthread, gamma, min_child_weight, max_delta_step, subsample, colsample_bytree, colsample_bylevel, reg_alpha, reg_lambda, scale_pos_weight, base_score, random_state, seed, missing) classifier.fit(table[feature_cols], table[label_col], sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set) # json get_param = classifier.get_params() feature_importance = classifier.feature_importances_ # plt.rcdefaults() plot_importance(classifier) plt.tight_layout() fig_plot_importance = plt2MD(plt) plt.clf() # plt.rcParams['figure.dpi'] = figure_dpi # plot_tree(classifier) # fig_plot_tree_UT = plt2MD(plt) # plt.clf() # plt.rcParams['figure.dpi'] = figure_dpi # plot_tree(classifier, rankdir='LR') # fig_plot_tree_LR = plt2MD(plt) # plt.rcdefaults() # plt.clf() model = _model_dict('xgb_classification_model') model['feature_cols'] = feature_cols model['label_col'] = label_col model['parameters'] = get_param model['feature_importance'] = feature_importance model['classifier'] = classifier # report # get_param_list = [] # get_param_list.append(['feature_cols', feature_cols]) # get_param_list.append(['label_col', label_col]) params = dict2MD(get_param) # for key, value in get_param.items(): # temp = [key, value] # get_param_list.append(temp) # get_param_df = pd.DataFrame(data=get_param_list, columns=['parameter', 'value']) feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_cols).T rb = ReportBuilder() rb.addMD( strip_margin(""" | ## XGB Classification Train Result | | ### Plot Importance | {fig_importance} | | ### Feature Importance | {table_feature_importance} | | ### Parameters | {list_parameters} | """.format(fig_importance=fig_plot_importance, table_feature_importance=pandasDF2MD(feature_importance_df, 20), list_parameters=params))) model['report'] = rb.get() return {'model': model}