def _autocorrelation(table, input_col, nlags=20, conf_level=0.95):
    data = table[input_col]

    plt.figure()
    plot_acf(data, lags=nlags, alpha=1 - conf_level)
    fig_plt_acf = plt2MD(plt)
    plt.clf()

    plt.figure()
    plot_pacf(data, lags=nlags, alpha=1 - conf_level)
    fig_plt_pacf = plt2MD(plt)
    plt.clf()

    acf_ret = acf(data, nlags=nlags, alpha=1 - conf_level)
    pacf_ret = pacf(data, nlags=nlags, alpha=1 - conf_level)

    result_table1 = pd.DataFrame([])
    result_table1['lag'] = list(range(nlags + 1))
    result_table1['ACF'] = acf_ret[0]
    if conf_level is not None:
        result_table1['%g%% confidence Interval' % (conf_level * 100)] = \
            [str((acf_ret[1][i][0], acf_ret[1][i][1])) for i in range(nlags + 1)]

    result_table2 = pd.DataFrame([])
    result_table2['lag'] = list(range(nlags + 1))
    result_table2['PACF'] = pacf_ret[0]
    if conf_level is not None:
        result_table2['%g%% confidence Interval' % (conf_level * 100)] = \
            [str((pacf_ret[1][i][0], pacf_ret[1][i][1])) for i in range(nlags + 1)]

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Autocorrelation / Partial Autocorrelation Result"""))
    rb.addMD(strip_margin("""
    |## Autocorrelation
    |
    |{image1}
    |
    |### Autocorrelation Table
    |
    |{result_table1}
    |
    |## Partial Autocorrelation
    |
    |{image2}
    |
    |### Partial Autocorrelation Table
    |
    |{result_table2}
    |
    """.format(image1=fig_plt_acf,
               result_table1=pandasDF2MD(result_table1, num_rows=nlags + 1),
               image2=fig_plt_pacf,
               result_table2=pandasDF2MD(result_table2, num_rows=nlags + 1))))

    model = _model_dict('autocorrelation')
    model['autocorrelation_table'] = result_table1
    model['partial_autocorrelation_table'] = result_table2
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
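# Illustrative sketch (not part of the original module): the statsmodels calls that
# _autocorrelation wraps, applied to synthetic data. The function name and data are
# hypothetical; only numpy and statsmodels are assumed.
def _autocorrelation_usage_sketch():
    import numpy as np
    from statsmodels.tsa.stattools import acf, pacf

    rng = np.random.default_rng(0)
    x = rng.normal(size=200)

    # With alpha set, acf/pacf each return (values, confidence_intervals).
    acf_vals, acf_conf = acf(x, nlags=20, alpha=0.05)
    pacf_vals, pacf_conf = pacf(x, nlags=20, alpha=0.05)
    return acf_vals, acf_conf, pacf_vals, pacf_conf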
def _unit_root_test(table, input_col, maxlag=None, regression='c', autolag='AIC'):
    if autolag == 'None':
        autolag = None
    result = adfuller(table[input_col], maxlag, regression, autolag)
    model = dict()

    if autolag is not None:
        rb = BrtcReprBuilder()
        rb.addMD(strip_margin("""
        ## Augmented Dickey-Fuller unit root test result
        | - null hypothesis : A unit root is present in a time series sample
        | - alternative hypothesis : There is no unit root
        | - Test statistic : {adf}
        | - p-value : {p_value}
        | - Number of observations used for the ADF regression and calculation of the critical values : {nobs}
        | - Number of lags used : {usedlag}
        | - Critical values for the test statistic at the 1 %, 5 %, and 10 % levels : {critical_values}
        | - The maximized information criterion if autolag is not None : {icbest}
        |
        """.format(adf=result[0], p_value=result[1], usedlag=result[2],
                   nobs=result[3], critical_values=result[4], icbest=result[5])))
    else:
        rb = BrtcReprBuilder()
        rb.addMD(strip_margin("""
        ## Augmented Dickey-Fuller unit root test result
        | - null hypothesis : A unit root is present in a time series sample
        | - alternative hypothesis : There is no unit root
        | - Test statistic : {adf}
        | - p-value : {p_value}
        | - Number of observations used for the ADF regression and calculation of the critical values : {nobs}
        | - Number of lags used : {usedlag}
        | - Critical values for the test statistic at the 1 %, 5 %, and 10 % levels : {critical_values}
        |
        """.format(adf=result[0], p_value=result[1], usedlag=result[2],
                   nobs=result[3], critical_values=result[4])))

    model['adf'] = result[0]
    model['p_value'] = result[1]
    model['usedlag'] = result[2]
    model['nobs'] = result[3]
    model['critical_values'] = result[4]
    if autolag is not None:
        model['icbest'] = result[5]
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
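# Illustrative sketch (not part of the original module): adfuller applied to a synthetic
# random walk, showing the tuple layout that _unit_root_test unpacks. Names and data are
# hypothetical; only numpy and statsmodels are assumed.
def _unit_root_test_usage_sketch():
    import numpy as np
    from statsmodels.tsa.stattools import adfuller

    rng = np.random.default_rng(0)
    random_walk = np.cumsum(rng.normal(size=300))  # non-stationary by construction

    result = adfuller(random_walk, maxlag=None, regression='c', autolag='AIC')
    adf_stat, p_value, usedlag, nobs, critical_values, icbest = result
    return adf_stat, p_value, critical_values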
def _hierarchical_clustering_post(table, model, num_clusters, cluster_col='prediction'):
    Z = model['model']
    mode = model['input_mode']
    if mode == 'matrix':
        distance_matrix = model['dist_matrix']
    out_table = model['linkage_matrix']
    predict = fcluster(Z, t=num_clusters, criterion='maxclust')

    if mode == 'original':
        prediction_table = table.copy()
    elif mode == 'matrix':
        prediction_table = distance_matrix
    prediction_table[cluster_col] = predict

    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])

    clusters_info_table = pd.DataFrame([])
    clusters_info_table[cluster_col] = M
    clusters_info_table['name_of_clusters'] = which_cluster
    clusters_info_table = clusters_info_table.sort_values(cluster_col)
    cluster_count = np.bincount(prediction_table[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    clusters_info_table['num_of_entities'] = list(cluster_count)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""### Hierarchical Clustering Post Process Result"""))
    rb.addMD(strip_margin("""
    |### Parameters
    |
    |{display_params}
    |
    |## Clusters Information
    |
    |{clusters_info_table}
    |
    """.format(display_params=dict2MD(model['parameters']),
               clusters_info_table=pandasDF2MD(clusters_info_table))))

    model = _model_dict('hierarchical_clustering_post')
    model['clusters_info'] = clusters_info_table
    model['_repr_brtc_'] = rb.get()

    return {'out_table': prediction_table, 'model': model}
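# Illustrative sketch (not part of the original module): the scipy calls used above --
# fcluster to cut a linkage into flat clusters and leaders to find the leader node of each
# cluster. Names and data are hypothetical; only numpy and scipy are assumed.
def _hierarchical_post_usage_sketch():
    import numpy as np
    from scipy.cluster.hierarchy import linkage, fcluster, leaders

    rng = np.random.default_rng(0)
    points = rng.normal(size=(10, 2))

    Z = linkage(points, method='ward')
    flat = fcluster(Z, t=3, criterion='maxclust')   # flat cluster labels, 1..3
    L, M = leaders(Z, flat)                         # leader node ids and their cluster labels
    return flat, L, M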
def _chi_square_test_of_independence(table, feature_cols, label_col, correction=False):
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Chi-square Test of Independence Result
    | - H0: the two categorical variables are independent.
    | - H1: the two categorical variables are dependent.
    """))

    model = _model_dict('chi_square_test_of_independence')
    for idx, feature_col in enumerate(feature_cols):
        contingency_table = pd.crosstab(table[feature_col], table[label_col], margins=True)
        feature_index = len(contingency_table) - 1
        label_index = len(contingency_table.columns) - 1
        temporary = contingency_table.iloc[0:feature_index, 0:label_index]

        test = stats.chi2_contingency(np.array(temporary), correction, 1)
        stat_chi = test[0]
        dof = test[2]
        p_chi = test[1]

        if p_chi < 0.05:
            dependence = 'Reject the null hypothesis that two categorical variables are independent at 5% significance level.'
        elif p_chi >= 0.05:
            dependence = 'No association was found between two categorical variables at 5% significance level.'
        elif math.isnan(p_chi):
            dependence = 'Independence of two categorical variables cannot be decided.'

        data = {
            'estimate': stat_chi,
            'df': dof,
            'p_value': p_chi
        }
        result_table = pd.DataFrame([data], columns=['estimate', 'df', 'p_value'])
        model['result{}'.format(idx)] = result_table

        rb.addMD(strip_margin("""
        |### Label: {label}, Feature: {feature}
        |###### Result Table {idx}
        |
        |{result_table}
        |
        |{dependence}
        |
        |
        """.format(label=label_col, feature=feature_col, idx=idx,
                   result_table=pandasDF2MD(result_table), dependence=dependence)))

    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _ancova(table, response_cols, factor_col, between_col):
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    ## Analysis of Covariance Result
    """))

    groups = table[between_col].unique()
    groups.sort()
    sum_len = np.sum([len(str(group)) for group in groups])

    result = dict()
    result['_grouped_data'] = dict()

    for response_col in response_cols:
        data = table[response_col]
        result['_grouped_data'][response_col] = dict()

        ax = sns.boxplot(x=between_col, y=response_col, data=table, order=groups)
        if sum_len > 512:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        elif sum_len > 64:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
        fig_box = plt2MD(plt)
        plt.clf()

        ancova_res = pg_ancova(data=table, dv=response_col, covar=factor_col, between=between_col)
        ancova_df = pandasDF2MD(ancova_res)

        rb.addMD(strip_margin("""
        | ## {response_col} by {between_col}
        | {fig_box}
        |
        | ### ANCOVA
        | {ancova_df}
        """.format(response_col=response_col, between_col=between_col,
                   fig_box=fig_box, ancova_df=ancova_df)))

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _kruskal_wallis_test(table, response_cols, factor_col, nan_policy='propagate'):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Kruskal Wallis test Result""")

    groups = dict()
    for name, group in table.groupby(factor_col):
        groups[name] = group

    for response_col in response_cols:
        stats, pval = kruskal(*[x[response_col] for x in groups.values()])
        rb.addMD(strip_margin("""
        | ## {response_col} by {factor_col}
        |
        | ### Statistics value: {stats}
        |
        | ### P value: {pval}
        """.format(response_col=response_col, factor_col=factor_col, stats=stats, pval=pval)))

        name = response_col + '_' + factor_col
        result[name] = dict()
        result[name]['Statistics'] = stats
        result[name]['P value'] = pval

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _mann_whitney_test(table, response_col, factor_col, use_continuity=True):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Mann Whitney test Result""")

    groups = dict()
    uniq_factor = table[factor_col].unique()
    for name in uniq_factor:
        groups[name] = np.array(table[response_col])[np.where(table[factor_col] == name)]

    group_name = []
    stats = []
    pvals = []
    for name1, name2 in itertools.combinations(uniq_factor, 2):
        name = str(name1) + ' vs ' + str(name2)
        stat, pval = mannwhitneyu(groups[name1], groups[name2], use_continuity=use_continuity)
        group_name.append(name)
        stats.append(stat)
        pvals.append(pval)

        result[name] = dict()
        result[name]['Statistics'] = stat
        result[name]['P value'] = pval

    rb.addMD(strip_margin("""
    | {table}
    """.format(table=pandasDF2MD(pd.DataFrame({'': group_name,
                                               'Test Statistics': stats,
                                               'P Value': pvals})))))

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _plot_roc_pr_curve(table, label_col, probability_col, fig_w=6.4, fig_h=4.8, pos_label=None):
    label = table[label_col]
    probability = table[probability_col]

    threshold, fig_tpr_fpr, fig_roc, fig_precision_recall, fig_pr, fig_confusion = \
        _plot_binary(label, probability, fig_size=(fig_w, fig_h), pos_label=pos_label)

    summary = dict()
    summary['threshold'] = threshold
    summary['label_col'] = label_col
    summary['probability_col'] = probability_col

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Plot ROC Curve and PR Curve Result
    |
    | ### ROC Curve
    | {fig_tpr_fpr}
    | {fig_roc}
    |
    | ### PR Curve
    | {fig_precision_recall}
    | {fig_pr}
    |
    | ### Confusion Matrix
    | {fig_confusion}
    """.format(fig_roc=fig_roc, fig_tpr_fpr=fig_tpr_fpr, fig_pr=fig_pr,
               fig_precision_recall=fig_precision_recall, fig_confusion=fig_confusion)))

    summary['_repr_brtc_'] = rb.get()
    return {'result': summary}
def _ada_boost_classification_train(table, feature_cols, label_col, max_depth=1,
                                    n_estimators=50, learning_rate=1.0,
                                    algorithm='SAMME.R', random_state=None):
    x_train = table[feature_cols]
    y_train = table[label_col]

    base_estimator = DecisionTreeClassifier(max_depth=max_depth)
    classifier = AdaBoostClassifier(base_estimator, n_estimators, learning_rate, algorithm, random_state)
    classifier.fit(x_train, y_train)

    params = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'feature_importance': classifier.feature_importances_,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'algorithm': algorithm,
        'random_state': random_state
    }

    model = _model_dict('ada_boost_classification_model')
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier
    model['params'] = params

    fig_feature_importance = _plot_feature_importance(feature_cols, classifier)
    params = dict2MD(get_param)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## AdaBoost Classification Train Result
    |
    | ### Feature Importance
    | {fig_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_feature_importance=fig_feature_importance, list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    feature_importance = classifier.feature_importances_
    feature_importance_table = pd.DataFrame(
        [[feature_cols[i], feature_importance[i]] for i in range(len(feature_cols))],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table

    return {'model': model}
def _ljung_box_test(table, input_cols, lags=None):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Ljung Box test Result""")

    for input_col in input_cols:
        lbvalue, pvalue = acorr_ljungbox(x=table[input_col], lags=lags)
        lb_res = dict()
        lb_res['lags'] = range(1, len(lbvalue) + 1)
        lb_res['test statistic'] = lbvalue
        lb_res['p-value based on chi-square distribution'] = pvalue
        lb_res = pd.DataFrame(lb_res)

        rb.addMD(strip_margin("""
        | ## {input_col} test result
        |
        | {lb_res}
        """.format(input_col=input_col, lb_res=pandasDF2MD(lb_res, num_rows=lb_res.shape[0]))))

        result[input_col] = lb_res

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _wilcoxon_test(table, response_col, factor_col, zero_method='wilcox', correction=False):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Wilcoxon Test Result""")

    groups = dict()
    for name, group in table.groupby(factor_col):
        groups[name] = group

    for name1, name2 in itertools.combinations(groups.keys(), 2):
        stats, pval = wilcoxon(x=groups[name1][response_col], y=groups[name2][response_col],
                               zero_method=zero_method, correction=correction)
        rb.addMD(strip_margin("""
        | ## {name1} vs {name2}
        |
        | ### The sum of the ranks of the differences: {stats}
        |
        | ### The two-sided p-value for the test: {pval}
        """.format(name1=name1, name2=name2, stats=stats, pval=pval)))

        name = str(name1) + '_' + str(name2)
        result[name] = dict()
        result[name]['Statistics'] = stats
        result[name]['P value'] = pval

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _timeseries_decomposition(table, input_col, frequency, model_type='additive',
                              filteration=None, two_sided=True, extrapolate_trend=0):
    out_table = table.copy()
    decomposition = sm.tsa.seasonal_decompose(out_table[input_col], model=model_type,
                                              filt=filteration, freq=frequency,
                                              two_sided=two_sided,
                                              extrapolate_trend=extrapolate_trend)
    decomposition.plot()
    plt2 = plt2MD(plt)
    plt.clf()

    out_table['trend'] = decomposition.trend
    out_table['seasonal'] = decomposition.seasonal
    out_table['residual'] = decomposition.resid

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Time Series Decomposition Result
    | Model Type : {model_type}
    |
    | {image2}
    |
    """.format(model_type=model_type, image2=plt2)))

    model = _model_dict('timeseries_decomposition')
    model['model_type'] = model_type
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
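# Illustrative sketch (not part of the original module): seasonal_decompose on a synthetic
# monthly-like series. Note that recent statsmodels takes `period=` where the code above
# passes the older `freq=` keyword. Names and data are hypothetical; numpy, pandas and
# statsmodels are assumed.
def _timeseries_decomposition_usage_sketch():
    import numpy as np
    import pandas as pd
    from statsmodels.tsa.seasonal import seasonal_decompose

    t = np.arange(120)
    series = pd.Series(10 + 0.05 * t + 2 * np.sin(2 * np.pi * t / 12))

    decomposition = seasonal_decompose(series, model='additive', period=12)
    return decomposition.trend, decomposition.seasonal, decomposition.resid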
def agglomerative_clustering_train_predict(input_table, input_cols, n_clusters=3,
                                           affinity='euclidean', compute_full_tree=True,
                                           linkage='ward', prediction_col='prediction',
                                           figw=6.4, figh=4.8):
    inputarr = input_table[input_cols]

    agglomerative_clustering = SKAgglomerativeClustering(n_clusters=n_clusters, affinity=affinity,
                                                         memory=None, connectivity=None,
                                                         compute_full_tree=compute_full_tree,
                                                         linkage=linkage)
    agglomerative_clustering.fit(inputarr)
    input_table[prediction_col] = agglomerative_clustering.labels_

    children = agglomerative_clustering.children_
    distance = np.arange(children.shape[0])
    no_of_observations = np.arange(2, children.shape[0] + 2)
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)

    plt.figure(figsize=(figw, figh))
    dendrogram(linkage_matrix)
    plot_dendrogram = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Agglomerative Clustering Result
    | {plot_dendrogram}
    """.format(plot_dendrogram=plot_dendrogram)))

    agglomerative_clustering_result = {'model': agglomerative_clustering,
                                       'input_cols': input_cols,
                                       '_repr_brtc_': rb.get()}

    return {'out_table': input_table, 'agglomerative_result': agglomerative_clustering_result}
def _logistic_regression_train(table, feature_cols, label_col, penalty='l2', dual=False,
                               tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1,
                               class_weight=None, random_state=None, solver='liblinear',
                               max_iter=100, multi_class='ovr', verbose=0, warm_start=False,
                               n_jobs=1):
    features = table[feature_cols]
    label = table[label_col]

    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    lr_model = LogisticRegression(penalty, dual, tol, C, fit_intercept, intercept_scaling,
                                  class_weight, random_state, solver, max_iter, multi_class,
                                  verbose, warm_start, n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2

    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_cols})
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)), axis=0)
        if not is_binary:
            summary = pd.concat((summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
        elif is_binary:
            summary = pd.concat((summary, pd.DataFrame(coef_trans, columns=[classes[0]])), axis=1)
    else:
        summary = pd.DataFrame({'features': feature_cols})
        coef_trans = np.transpose(coefficients)
        if not is_binary:
            summary = pd.concat((summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
        elif is_binary:
            summary = pd.concat((summary, pd.DataFrame(coef_trans, columns=[classes[0]])), axis=1)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    """.format(table1=pandasDF2MD(summary))))

    model = _model_dict('logistic_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _mann_whitney_test(table, response_col, factor_col, use_continuity=True):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Mann Whitney test Result""")

    groups = dict()
    uniq_factor = table[factor_col].unique()
    for name in uniq_factor:
        groups[name] = np.array(table[response_col])[np.where(table[factor_col] == name)]

    for name1, name2 in itertools.combinations(uniq_factor, 2):
        stats, pval = mannwhitneyu(groups[name1], groups[name2], use_continuity=use_continuity)
        rb.addMD(strip_margin("""
        | ## {name1} vs {name2}
        |
        | ### Statistics U value: {stats}
        |
        | ### P value: {pval}
        """.format(name1=name1, name2=name2, stats=stats, pval=pval)))

        name = str(name1) + '_' + str(name2)
        result[name] = dict()
        result[name]['Statistics'] = stats
        result[name]['P value'] = pval

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _svm_classification_train(table, feature_cols, label_col, c=1.0, kernel='rbf', degree=3,
                              gamma='auto', coef0=0.0, shrinking=True, probability=True,
                              tol=1e-3, max_iter=-1, random_state=None):
    validate(greater_than(c, 0.0, 'c'))

    _table = table.copy()
    _feature_cols = _table[feature_cols]
    _label_col = _table[label_col]

    if sklearn_utils.multiclass.type_of_target(_label_col) == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')

    _svc = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
                   shrinking=shrinking, probability=probability, tol=tol,
                   max_iter=max_iter, random_state=random_state)
    _svc_model = _svc.fit(_feature_cols, _label_col)

    get_param = _svc.get_params()
    get_param['feature_cols'] = feature_cols
    get_param['label_col'] = label_col

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter}
    """.format(table_parameter=dict2MD(get_param))))

    _model = _model_dict('svc_model')
    _model['svc_model'] = _svc_model
    _model['features'] = feature_cols
    _model['_repr_brtc_'] = rb.get()

    return {'model': _model}
def _label_encoder2(table, input_cols, suffix='_index'):
    out_table = table.copy()
    out_model_list = [None] * len(input_cols)
    new_col_list = []
    number_distinct_classes = []

    for ind, col in enumerate(input_cols):
        le = LabelEncoder().fit(table[col])
        out_model_list[ind] = le
        new_col_name = col + suffix
        new_col_list.append(new_col_name)
        number_distinct_classes.append(len(le.classes_))
        out_table[new_col_name] = le.transform(table[col])

    out_model = _model_dict('label_encoders')
    out_model['label_encoders'] = out_model_list
    out_model['input_cols'] = input_cols

    rb = BrtcReprBuilder()
    params = {"Input columns": input_cols, "Suffix": suffix}
    summary_table = pd.DataFrame()
    summary_table['Input columns'] = input_cols
    summary_table['No. distinct classes'] = number_distinct_classes
    summary_table['New column names'] = new_col_list

    rb.addMD(strip_margin("""
    | ## Label Encoder Model
    | ### Parameters
    | {params}
    | ### Summary
    | {summary_table}
    """.format(params=dict2MD(params), summary_table=pandasDF2MD(summary_table))))

    out_model['_repr_brtc_'] = rb.get()
    return {'out_table': out_table, 'model': out_model}
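# Illustrative sketch (not part of the original module): the sklearn LabelEncoder behaviour
# that _label_encoder2 applies per column. Names and data are hypothetical; pandas and
# scikit-learn are assumed.
def _label_encoder_usage_sketch():
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    df = pd.DataFrame({'color': ['red', 'green', 'red', 'blue']})
    le = LabelEncoder().fit(df['color'])
    df['color_index'] = le.transform(df['color'])   # classes are sorted: blue->0, green->1, red->2
    return df, list(le.classes_)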
def _wilcoxon_test2(table, first_col, second_col, zero_method='wilcox', correction=False):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Wilcoxon Test Result""")

    alter_hypothesis = []
    stats = []
    pvals = []

    stat, pval = wilcoxon(x=table[first_col], y=table[second_col],
                          zero_method=zero_method, correction=correction)
    alter_hypothesis.append('Median of the differences != 0')
    stats.append(stat)
    pvals.append(pval)

    result_table = pd.DataFrame({
        'Alternative hypothesis': alter_hypothesis,
        'Sum of differences ranks': stats,
        'P-value': pvals
    })

    rb.addMD(strip_margin("""
    | {table}
    """.format(table=pandasDF2MD(result_table))))

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _kruskal_wallis_test(table, response_cols, factor_col, nan_policy='propagate'):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Kruskal Wallis test Result""")

    groups = dict()
    for name, group in table.groupby(factor_col):
        groups[name] = group

    group_name = []
    df = [len(groups) - 1] * len(response_cols)
    stats = []
    pvals = []
    for response_col in response_cols:
        stat, pval = kruskal(*[x[response_col] for x in groups.values()])
        group_name.append(response_col + ' by ' + factor_col)
        stats.append(stat)
        pvals.append(pval)

        name = response_col + '_' + factor_col
        result[name] = dict()
        result[name]['Statistics'] = stat
        result[name]['P value'] = pval

    rb.addMD(strip_margin("""
    | {table}
    """.format(table=pandasDF2MD(pd.DataFrame({'': group_name,
                                               'Degree of Freedom': df,
                                               'Test Statistics': stats,
                                               'P value': pvals})))))

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _paired_ttest(table, first_column, second_column, alternative,
                  hypothesized_difference=0, confidence_level=0.95):
    df = len(table) - 1
    first_col = table[first_column]
    second_col = table[second_column]
    diff_mean = (first_col - second_col).mean()
    std_dev = np.std(first_col - second_col)
    t_value = stats.ttest_rel(table[first_column], table[second_column] + hypothesized_difference)[0]

    result = []
    alternative_hypothesis = []
    p_value = []
    confidence_interval = []

    if 'greater' in alternative:
        alternative_hypothesis.append('true difference in means > ' + str(hypothesized_difference))
        p_value.append(stats.t.sf(t_value, df))
        confidence_interval.append((diff_mean - std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df),
                                    np.Infinity))

    if 'less' in alternative:
        alternative_hypothesis.append('true difference in means < ' + str(hypothesized_difference))
        p_value.append(stats.t.cdf(t_value, df))
        confidence_interval.append((-np.Infinity,
                                    diff_mean + std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df)))

    if 'twosided' in alternative:
        alternative_hypothesis.append('true difference in means != ' + str(hypothesized_difference))
        p_value.append(stats.ttest_rel(first_col, second_col + hypothesized_difference)[1])
        other_term = std_dev * stats.t.isf((1 - confidence_level) / 2, df) / np.sqrt(df)
        confidence_interval.append((diff_mean - other_term, diff_mean + other_term))

    result.append(['alternative hypothesis', alternative_hypothesis])
    result.append(['p-value', p_value])
    result.append(['%g%% confidence Interval' % (confidence_level * 100), confidence_interval])
    result_table = pd.DataFrame.from_items(result)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    |## Paired T Test Result
    |##### df : {deg_f}
    |##### Mean of differences : {dm}
    |##### Standard deviation : {sd}
    |##### t-value : {tv}
    |
    |#### Summary
    |
    |{result_table}
    |
    """.format(deg_f=df, dm=diff_mean, sd=std_dev, tv=t_value,
               result_table=pandasDF2MD(result_table))))

    model = dict()
    model['_repr_brtc_'] = rb.get()
    model['degree_of_freedom'] = df
    model['mean_of_differences'] = diff_mean
    model['standard_deviation'] = std_dev
    model['t_value'] = t_value
    model['summary'] = result_table

    return {'model': model}
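# Illustrative sketch (not part of the original module): how the one-sided p-values above
# relate to scipy's two-sided paired t-test. Names and data are hypothetical; numpy and
# scipy are assumed.
def _paired_ttest_usage_sketch():
    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(0)
    before = rng.normal(10.0, 1.0, size=30)
    after = before + rng.normal(0.3, 0.5, size=30)

    t_value, p_two_sided = stats.ttest_rel(before, after)
    df = len(before) - 1
    p_greater = stats.t.sf(t_value, df)    # H1: mean(before - after) > 0
    p_less = stats.t.cdf(t_value, df)      # H1: mean(before - after) < 0
    return t_value, p_two_sided, p_greater, p_less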
def _ada_boost_regression_train(table, feature_cols, label_col, max_depth=3, n_estimators=50,
                                learning_rate=1.0, loss='linear', random_state=None):
    feature_names, x_train = check_col_type(table, feature_cols)
    y_train = table[label_col]

    base_estimator = DecisionTreeRegressor(max_depth=max_depth)
    regressor = AdaBoostRegressor(base_estimator, n_estimators, learning_rate, loss, random_state)
    regressor.fit(x_train, y_train)

    params = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'feature_importance': regressor.feature_importances_,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'loss': loss,
        'random_state': random_state
    }

    model = _model_dict('ada_boost_regression_model')
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor
    model['params'] = params

    fig_feature_importance = _plot_feature_importance(feature_names, regressor)
    params = dict2MD(get_param)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## AdaBoost Regression Train Result
    |
    | ### Feature Importance
    | {fig_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_feature_importance=fig_feature_importance, list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    feature_importance = regressor.feature_importances_
    feature_importance_table = pd.DataFrame(
        [[feature_names[i], feature_importance[i]] for i in range(len(feature_names))],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table

    return {'model': model}
def _bow(table, input_col, add_words=None, no_below=1, no_above=0.8, keep_n=10000):
    word_list = table[input_col].tolist()
    dictionary = Dictionary(word_list)
    if add_words is not None:
        dictionary.add_documents([add_words])
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n, keep_tokens=None)

    params = {
        'Input Column': input_col,
        'Minimum Number of Occurrence': no_below,
        'Maximum Fraction of Occurrence': no_above,
        'Keep N most Frequent': keep_n
    }

    empty_description = ''
    if len(list(dictionary.dfs.values())) == 0:
        out_table = pd.DataFrame([], columns=['token', 'document_frequency'])
        empty_description = 'Out table is empty since parameter "Minimum Number of Occurrence" is greater than the maximum of document frequency.'
    else:
        out_table = pd.DataFrame.from_dict(dictionary.token2id, orient='index').drop([0], axis=1)
        out_table.insert(loc=0, column='token', value=dictionary.token2id.keys())
        token_cnt = sorted(dictionary.dfs.items(), key=operator.itemgetter(0))
        dfs_list = []
        for i in range(len(dictionary.dfs)):
            dfs_list.append(token_cnt[i][1])
        out_table['document_frequency'] = dfs_list

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    |# Bag of Words Result
    |### Parameters
    |
    | {display_params}
    |
    | {description}
    |
    """.format(display_params=dict2MD(params), description=empty_description)))

    model = _model_dict('bow')
    model['dict_table'] = out_table
    model['dictionary'] = dictionary
    model['add_words'] = add_words
    model['_repr_brtc_'] = rb.get()

    return {'model': model, 'out_table': out_table}
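# Illustrative sketch (not part of the original module): the gensim Dictionary workflow that
# _bow wraps -- build a dictionary from tokenized documents, prune it, and read document
# frequencies. Names and data are hypothetical; gensim is assumed.
def _bow_usage_sketch():
    from gensim.corpora import Dictionary

    docs = [['apple', 'banana', 'apple'], ['banana', 'cherry'], ['apple', 'cherry']]
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=1, no_above=0.8, keep_n=10000)
    # token2id maps tokens to ids; dfs maps ids to document frequency.
    return dict(dictionary.token2id), dict(dictionary.dfs)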
def _holt_winters_train(table, input_cols, period, model_type='additive'):
    rb = BrtcReprBuilder()
    model = _model_dict('holt_winters_train')
    rb.addMD(strip_margin("""
    |
    |## Holt-Winters Train Result
    |
    """.format()))

    model['origin_table'] = table
    for column in input_cols:
        hw = ExponentialSmoothing(table[column], trend=model_type, seasonal=model_type,
                                  seasonal_periods=period).fit()
        model['hw_' + str(column)] = hw
        rb.addMD(strip_margin("""
        |
        |### Column : {col}
        |
        | - Model Type : {mt}
        | - Period : {pd}
        | - SSE : {sse}
        | - AIC : {aic}
        | - BIC : {bic}
        |
        """.format(col=column, mt=model_type, pd=period, sse=hw.sse, aic=hw.aic, bic=hw.bic)))
        model['sse_' + str(column)] = hw.sse
        model['aic_' + str(column)] = hw.aic
        model['bic_' + str(column)] = hw.bic

    model['input_columns'] = input_cols
    model['_repr_brtc_'] = rb.get()
    model['model_type'] = model_type
    model['period'] = period

    return {'model': model}
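# Illustrative sketch (not part of the original module): the statsmodels ExponentialSmoothing
# fit that _holt_winters_train performs per column, plus a short forecast. Names and data are
# hypothetical; numpy, pandas and statsmodels are assumed.
def _holt_winters_usage_sketch():
    import numpy as np
    import pandas as pd
    from statsmodels.tsa.holtwinters import ExponentialSmoothing

    t = np.arange(48)
    series = pd.Series(20 + 0.1 * t + 3 * np.sin(2 * np.pi * t / 12))

    fit = ExponentialSmoothing(series, trend='additive', seasonal='additive',
                               seasonal_periods=12).fit()
    return fit.sse, fit.aic, fit.bic, fit.forecast(12)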
def test_var_pop(self):
    query = strip_margin('''
    | select var_pop(sepal_length) as var_pop_sepal_length from #{DF(0)}
    ''')
    result_df = sql_execute(df_iris, query)['out_table']
    print(result_df)
    self.assertAlmostEqual(0.6811222222222235, result_df.values[0][0], 10,
                           'var_pop gives a wrong result.')
def test_split(self):
    query = strip_margin('''
    | select split(species, 't') from #{DF(0)}
    ''')
    result_df = sql_execute(df_iris, query)['out_table']
    print(result_df)
    self.assertEqual(['se', 'osa'], result_df.values[0][0],
                     'split gives a wrong result.')
def test_exp2(self):
    query = strip_margin('''
    | select exp2(sepal_length), exp2(10) from #{DF(0)}
    ''')
    result_df = sql_execute(df_iris, query)['out_table']
    print(result_df)
    self.assertAlmostEqual(34.29675080116137, result_df.values[0][0], 10,
                           'exp2 gives a wrong result.')
def test_exp(self):
    query = strip_margin('''
    | select exp(sepal_length), exp(10) from #{DF(0)}
    ''')
    result_df = sql_execute(df_iris, query)['out_table']
    print(result_df)
    self.assertAlmostEqual(164.0219072999017, result_df.values[0][0], 10,
                           'exp gives a wrong result.')
def test_log2(self):
    query = strip_margin('''
    | select log2(sepal_length), log2(10) from #{DF(0)}
    ''')
    result_df = sql_execute(df_iris, query)['out_table']
    print(result_df)
    self.assertAlmostEqual(2.350497247084133, result_df.values[0][0], 10,
                           'log2 gives a wrong result.')
def test_log10(self):
    query = strip_margin('''
    | select log10(sepal_length), log10(10) from #{DF(0)}
    ''')
    result_df = sql_execute(df_iris, query)['out_table']
    print(result_df)
    self.assertAlmostEqual(0.7075701760979364, result_df.values[0][0], 10,
                           'log10 gives a wrong result.')
def test_pi(self):
    query = strip_margin('''
    | select sepal_length + pi() from #{DF(0)}
    ''')
    result_df = sql_execute(df_iris, query)['out_table']
    print(result_df)
    self.assertAlmostEqual(8.241592653589793, result_df.values[0][0], 10,
                           'pi gives a wrong result.')