def default(self, obj):
    """JSON-encoder fallback for otherwise unserializable values.

    Sets become lists, numpy arrays are converted via _to_default_list,
    and anything else is wrapped as a markdown repr payload.
    """
    # TODO add more support types
    if isinstance(obj, set):
        return list(obj)
    if isinstance(obj, numpy.ndarray):
        return _to_default_list(obj)
    builder = BrtcReprBuilder()
    builder.addRawTextMD(str(obj))
    return {'type': 'python object', '_repr_brtc_': builder.get()}
def _profile_table(table, bins=10, check_correlation=False, correlation_threshold=0.9, correlation_overrides=None):
    """Run pandas-profiling over *table* and wrap the HTML report for display."""
    profile = pd_profiling.ProfileReport(table,
                                         bins=bins,
                                         check_correlation=check_correlation,
                                         correlation_threshold=correlation_threshold,
                                         correlation_overrides=correlation_overrides)
    report_builder = BrtcReprBuilder()
    report_builder.addHTML(profile.html)
    summary = {'_repr_brtc_': report_builder.get()}
    return {'result': summary}
def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction', init='k-means++', n_init=10, max_iter=300, tol=1e-4, precompute_distances='auto', seed=None, n_jobs=1, algorithm='auto', n_samples=None):
    """Fit sklearn KMeans on table[input_cols] and append cluster labels.

    Returns {'out_table': copy of table with prediction_col added,
             'model': kmeans model dict with an embedded markdown report}.
    """
    inputarr = table[input_cols]
    # Default: include every row in the samples figure.
    if n_samples is None:
        n_samples = len(inputarr)
    # Reject out-of-range hyperparameters up front (project validators).
    validate(greater_than_or_equal_to(n_clusters, 1, 'n_clusters'), greater_than_or_equal_to(n_init, 1, 'n_init'), greater_than_or_equal_to(max_iter, 1, 'max_iter'), greater_than(tol, 0.0, 'tol'), greater_than_or_equal_to(n_jobs, 1, 'n_jobs'), greater_than_or_equal_to(n_samples, 0, 'n_samples'))
    k_means = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, tol=tol, precompute_distances=precompute_distances, verbose=0, random_state=seed, copy_x=True, n_jobs=n_jobs, algorithm=algorithm)
    k_means.fit(inputarr)
    params = {'input_cols':input_cols, 'n_clusters':n_clusters, 'init':init, 'n_init':n_init, 'max_iter':max_iter, 'tol':tol, 'precompute_distances':precompute_distances, 'seed':seed, 'n_jobs':n_jobs, 'algorithm':algorithm}
    cluster_centers = k_means.cluster_centers_
    labels = k_means.labels_
    # 2-component PCA is only used to project samples for the scatter plots.
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    fig_centers = _kmeans_centers_plot(input_cols, cluster_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2)
    rb = BrtcReprBuilder()
    # NOTE(review): this markdown template looks whitespace-mangled (newlines
    # collapsed to spaces); kept byte-identical — confirm against strip_margin.
    rb.addMD(strip_margin(""" | ## Kmeans Result | - Number of iterations run: {n_iter_}. 
| - Coordinates of cluster centers | {fig_cluster_centers} | - Samples | {fig_pca} | {fig_samples} | | ### Parameters | {params} """.format(n_iter_=k_means.n_iter_, fig_cluster_centers=fig_centers, fig_pca=fig_pca, fig_samples=fig_samples, params=dict2MD(params))))
    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()
    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table':out_table, 'model':model}
def _discretize_quantile(table, input_col, num_of_buckets=2, out_col_name='bucket_number'):
    """Quantile-based discretization of table[input_col] into num_of_buckets bins.

    Returns {'out_table': copy of table with out_col_name bucket indices,
             'model': model dict with the per-bucket interval/count table}.
    """
    out_table = table.copy()
    # labels=False yields integer bucket indices; duplicates='drop' tolerates
    # repeated quantile edges (so fewer than num_of_buckets bins may result).
    out_table[out_col_name], buckets = pd.qcut(table[input_col], num_of_buckets, labels=False, retbins=True, precision=10, duplicates='drop')
    params = {
        'input_col': input_col,
        'num_of_buckets': num_of_buckets,
        'out_col_name': out_col_name
    }
    cnt = Counter(out_table[out_col_name].values)
    index_list = []
    bucket_list = []
    cnt_list = []
    for i in range(len(buckets) - 1):
        # First interval is closed on the left, the rest are half-open.
        left = '[' if i == 0 else '('
        index_list.append(i)
        cnt_list.append(cnt[i])
        bucket_list.append("{left}{lower}, {upper}]".format(
            left=left, lower=buckets[i], upper=buckets[i + 1]))
    # pd.DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0;
    # build the frame with an explicit column order instead.
    result = pd.DataFrame({'bucket number': index_list,
                           'buckets': bucket_list,
                           'count': cnt_list},
                          columns=['bucket number', 'buckets', 'count'])
    # Build model report
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin(""" | ## Quantile-based Discretization Result | ### Result | {result} | | ### Parameters | {params} """.format(result=pandasDF2MD(result), params=dict2MD(params))))
    model = _model_dict('discretize_quantile')
    model['result'] = result
    model['params'] = params
    model['_repr_brtc_'] = rb.get()
    return {'out_table': out_table, 'model': model}
def _evaluate_regression(table, label_col, prediction_col):
    """Compute standard regression metrics comparing label_col to prediction_col.

    Returns {'result': summary dict} where summary holds every metric, a
    one-row metrics DataFrame, and a markdown report.
    """
    label = table[label_col]
    predict = table[prediction_col]
    # compute metrics
    evs = explained_variance_score(label, predict)
    mse = mean_squared_error(label, predict)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(label, predict)
    mape = _mean_absolute_percentage_error(label, predict)
    mdae = median_absolute_error(label, predict)
    r2 = r2_score(label, predict)
    # json summary of each metric
    summary = dict()
    summary['label_col'] = label_col
    summary['prediction_col'] = prediction_col
    summary['r2_score'] = r2
    summary['mean_squared_error'] = mse
    summary['root_mean_squared_error'] = rmse
    summary['mean_absolute_error'] = mae
    # Fix: MAPE was computed and shown in the report table but never stored
    # in the summary dict alongside the other metrics.
    summary['mean_absolute_percentage_error'] = mape
    summary['median_absolute_error'] = mdae
    summary['explained_variance_score'] = evs
    # report table (one row, fixed column order)
    all_dict_list = [{
        'r2_score': r2,
        'mean_squared_error': mse,
        'root_mean_squared_error': rmse,
        'mean_absolute_error': mae,
        'mean_absolute_percentage_error': mape,
        'median_absolute_error': mdae,
        'explained_variance_score': evs
    }]
    all_df = pd.DataFrame(all_dict_list)
    all_df = all_df[[
        'r2_score', 'mean_squared_error', 'root_mean_squared_error',
        'mean_absolute_error', 'mean_absolute_percentage_error',
        'median_absolute_error', 'explained_variance_score'
    ]]
    summary['metrics'] = all_df
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin(""" | ## Evaluate Regression Result | ### Metrics | {table1} | | """.format(table1=pandasDF2MD(all_df))))
    summary['_repr_brtc_'] = rb.get()
    return {'result': summary}
def default(self, obj):
    """JSON-encoder fallback: sets -> lists, ndarrays -> nested lists,
    anything else -> a markdown repr payload."""
    if isinstance(obj, set):
        return list(obj)
    if isinstance(obj, numpy.ndarray):
        return obj.tolist()
    # TODO add more support types
    rb = BrtcReprBuilder()
    rb.addRawTextMD(str(obj))
    return {'type':'python object', '_repr_brtc_':rb.get()}
def _random_forest_classification_train(table, feature_cols, label_col, n_estimators=10, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0, max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None):
    """Train a RandomForestClassifier on table[feature_cols] -> table[label_col].

    Returns {'model': dict with the fitted classifier, its parameters, and a
    markdown feature-importance report}.
    """
    X_train = table[feature_cols]
    y_train = table[label_col]
    # UI sends the literal string "None" for "no limit".
    if max_features == "None":
        max_features = None
    # Fix: the original passed only the first nine parameters (positionally)
    # and silently dropped bootstrap/oob_score/n_jobs/random_state/verbose/
    # warm_start/class_weight even though they were reported in `params`.
    # Keywords also keep this working on scikit-learn versions where these
    # constructor arguments are keyword-only.
    classifier = RandomForestClassifier(n_estimators=n_estimators,
                                        criterion=criterion,
                                        max_depth=max_depth,
                                        min_samples_split=min_samples_split,
                                        min_samples_leaf=min_samples_leaf,
                                        min_weight_fraction_leaf=min_weight_fraction_leaf,
                                        max_features=max_features,
                                        max_leaf_nodes=max_leaf_nodes,
                                        min_impurity_decrease=min_impurity_decrease,
                                        bootstrap=bootstrap,
                                        oob_score=oob_score,
                                        n_jobs=n_jobs,
                                        random_state=random_state,
                                        verbose=verbose,
                                        warm_start=warm_start,
                                        class_weight=class_weight)
    classifier.fit(X_train, y_train)
    params = {'feature_cols': feature_cols, 'label_col': label_col, 'n_estimators': n_estimators, 'criterion': criterion, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf, 'min_weight_fraction_leaf': min_weight_fraction_leaf, 'max_features': max_features, 'max_leaf_nodes': max_leaf_nodes, 'min_impurity_decrease': min_impurity_decrease, 'bootstrap': bootstrap, 'oob_score': oob_score, 'n_jobs': n_jobs, 'random_state': random_state, 'verbose': verbose, 'warm_start': warm_start, 'class_weight': class_weight}
    model = dict()
    model['classifier'] = classifier
    model['params'] = params
    fig_feature_importances = _plot_feature_importances(feature_cols, classifier)
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin(""" | ## Random Forest Classification Train Result | | ### Feature Importance | {fig_feature_importances} | """.format(fig_feature_importances=fig_feature_importances)))
    model['_repr_brtc_'] = rb.get()
    return {'model' : model}
def _svm_classification_train(table, feature_cols, label_col, c=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=True, tol=1e-3, max_iter=-1, random_state=None):
    """Train an sklearn SVC on table[feature_cols] -> table[label_col].

    Returns {'model': model dict with the fitted SVC and a markdown
    parameter report}. Raises via raise_runtime_error when the label
    column is continuous (SVC is a classifier).
    """
    # c is the SVC regularization parameter C; must be strictly positive.
    validate(greater_than(c, 0.0, 'c'))
    _table = table.copy()
    _feature_cols = _table[feature_cols]
    _label_col = _table[label_col]
    # Guard: classification only — reject continuous targets early.
    if (sklearn_utils.multiclass.type_of_target(_label_col) == 'continuous'):
        raise_runtime_error('''Label Column should not be continuous.''')
    _svc = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking, probability=probability, tol=tol, max_iter=max_iter, random_state=random_state)
    _svc_model = _svc.fit(_feature_cols, _label_col)
    # Report the effective sklearn parameters plus the column selections.
    get_param = _svc.get_params()
    get_param['feature_cols'] = feature_cols
    get_param['label_col'] = label_col
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin(""" | ## SVM Classification Result | ### Parameters | {table_parameter} """.format(table_parameter=dict2MD(get_param))))
    _model = _model_dict('svc_model')
    _model['svc_model'] = _svc_model
    _model['features'] = feature_cols
    _model['_repr_brtc_'] = rb.get()
    return {'model': _model}
def _scale(table, input_cols, scaler, suffix=None):
    """Scale input_cols with the selected sklearn scaler, appending new
    suffixed columns to a copy of *table*.

    scaler: one of 'RobustScaler', 'StandardScaler', 'MaxAbsScaler',
    anything else selects MinMaxScaler. suffix defaults per scaler.
    Returns {'out_table': table copy with scaled columns, 'model': scaler model dict}.
    """
    if scaler == 'RobustScaler':
        if suffix is None:
            suffix = '_robust'
        scale = RobustScaler()
    elif scaler == 'StandardScaler':
        if suffix is None:
            suffix = '_standard'
        scale = StandardScaler()
    elif scaler == 'MaxAbsScaler':
        if suffix is None:
            suffix = '_max_abs'
        scale = MaxAbsScaler()
    else:  # minmax
        if suffix is None:
            suffix = '_min_max'
        scale = MinMaxScaler()
    scaled_cols = [col + suffix for col in input_cols]
    out_table = table.copy()
    scaled_table = scale.fit_transform(out_table[input_cols])
    # Fix: align on out_table's index. A bare pd.DataFrame(scaled_table) gets a
    # fresh RangeIndex, which produces NaN columns whenever the input table has
    # a non-default index (e.g. after filtering).
    out_table[scaled_cols] = pd.DataFrame(data=scaled_table, index=out_table.index)
    out_model = _model_dict('scaler')
    out_model['input_cols'] = input_cols
    out_model['used_scaler'] = scaler
    out_model['scaler'] = scale
    out_model['suffix'] = suffix
    rb = BrtcReprBuilder()
    params = {
        "Input columns": input_cols,
        "Normalization method": scaler,
        "Suffix": suffix
    }
    summary_table = pd.DataFrame()
    summary_table['Input columns'] = input_cols
    summary_table['Normalization method'] = [scaler] * len(input_cols)
    summary_table['New column names'] = scaled_cols
    # Fix: the report title said "Label Encoder Model" — copy-paste from the
    # label-encoder function; this is the scaler report.
    rb.addMD(strip_margin(""" | ## Scaler Model | ### Parameters | {params} | | ### Summary table | {summary_table} """.format(params=dict2MD(params), summary_table=pandasDF2MD(summary_table))))
    out_model['_repr_brtc_'] = rb.get()
    return {'out_table': out_table, 'model': out_model}
def _unit_root_test(table, input_col, maxlag=None, regression='c', autolag='AIC'):
    """Augmented Dickey-Fuller unit-root test on table[input_col].

    Wraps statsmodels adfuller, which returns
    (adf, pvalue, usedlag, nobs, critical_values[, icbest]) — icbest only
    when autolag is not None, hence the two report branches below.
    Returns {'model': dict of test outputs plus a markdown report}.
    """
    # UI sends the literal string 'None' to disable automatic lag selection.
    if autolag == 'None':
        autolag = None
    result = adfuller(table[input_col], maxlag, regression, autolag)
    model = dict()
    if autolag is not None:
        rb = BrtcReprBuilder()
        # Template includes the maximized information criterion (icbest).
        rb.addMD(strip_margin(""" ## Augmented Dickey-Fuller unit root test result | - null hypothesis : A unit root is present in a time series sample | - alternative hypothesis : There is no unit root | - Test statistic : {adf} | - p-value : {p_value} | - Number of observations used for the ADF regression and calculation of the critical values : {nobs} | - Number of lags used : {usedlag} | - Critical values for the test statistic at the 1 %, 5 %, and 10 % levels : {critical_values} | - The maximized information criterion if autolag is not None : {icbest} | """.format(adf=result[0], p_value=result[1], usedlag=result[2], nobs=result[3], critical_values=result[4], icbest=result[5])))
    else:
        rb = BrtcReprBuilder()
        # Same report without icbest (not returned when autolag is None).
        rb.addMD(strip_margin(""" ## Augmented Dickey-Fuller unit root test result | - null hypothesis : A unit root is present in a time series sample | - alternative hypothesis : There is no unit root | - Test statistic : {adf} | - p-value : {p_value} | - Number of observations used for the ADF regression and calculation of the critical values : {nobs} | - Number of lags used : {usedlag} | - Critical values for the test statistic at the 1 %, 5 %, and 10 % levels : {critical_values} | """.format(adf=result[0], p_value=result[1], usedlag=result[2], nobs=result[3], critical_values=result[4])))
    model['adf'] = result[0]
    model['p_value'] = result[1]
    model['usedlag'] = result[2]
    model['nobs'] = result[3]
    model['critical_values'] = result[4]
    if autolag is not None:
        model['icbest'] = result[5]
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction', init='k-means++', n_init=10, max_iter=300, tol=1e-4, precompute_distances='auto', seed=None, n_jobs=1, algorithm='auto', n_samples=None):
    """Fit sklearn KMeans (variant using check_col_type and colored plots).

    Returns {'out_table': copy of table with prediction_col labels,
             'model': kmeans model dict with a markdown report}.
    """
    # check_col_type resolves vector/scalar column handling (project helper).
    feature_names, inputarr = check_col_type(table, input_cols)
    if n_samples is None:
        n_samples = len(inputarr)
    k_means = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, tol=tol, precompute_distances=precompute_distances, verbose=0, random_state=seed, copy_x=True, n_jobs=n_jobs, algorithm=algorithm)
    k_means.fit(inputarr)
    params = {'input_cols':feature_names, 'n_clusters':n_clusters, 'init':init, 'n_init':n_init, 'max_iter':max_iter, 'tol':tol, 'precompute_distances':precompute_distances, 'seed':seed, 'n_jobs':n_jobs, 'algorithm':algorithm, 'n_samples':n_samples}
    cluster_centers = k_means.cluster_centers_
    # Re-derive the effective cluster count from the fitted centers.
    n_clusters = len(cluster_centers)
    # One stable color per cluster, shared by all three figures.
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    labels = k_means.labels_
    # PCA capped at the number of features so 1-D input still works.
    pca2_model = PCA(n_components=min(2, len(feature_names))).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    fig_centers = _kmeans_centers_plot(feature_names, cluster_centers, colors)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers, seed, colors)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2, colors)
    rb = BrtcReprBuilder()
    # NOTE(review): markdown template appears whitespace-mangled (newlines
    # collapsed); kept byte-identical — confirm against strip_margin.
    rb.addMD(strip_margin(""" | ## Kmeans Result | - Number of iterations run: {n_iter_}. | - Sum of square error: {sse_}. 
| - Coordinates of cluster centers | {fig_cluster_centers} | - Samples | {fig_pca} | {fig_samples} | | ### Parameters | {params} """.format(n_iter_=k_means.n_iter_, sse_=k_means.inertia_, fig_cluster_centers=fig_centers, fig_pca=fig_pca, fig_samples=fig_samples, params=dict2MD(params))))
    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()
    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table':out_table, 'model':model}
def _tfidf_gensim(table, input_col, output_col_name="sparse_vectors", tf_weighing='n', df_weighing='t', document_normalization='c'):
    """Compute gensim SMART-notation TF-IDF vectors for tokenized documents.

    table[input_col] must hold token lists. The three weighing arguments are
    concatenated into a gensim `smartirs` scheme string (e.g. 'ntc').
    Returns {'out_table': table copy with sparse-vector JSON column,
             'model': tfidf model dict with a dictionary report}.
    """
    out_table = table.copy()
    tokens = out_table[input_col]
    # SMART scheme: term-frequency, document-frequency, normalization codes.
    smartirs = tf_weighing + df_weighing + document_normalization
    dictionary = Dictionary(tokens)
    word_count_vector_list = [dictionary.doc2bow(text) for text in tokens]
    tfidf_model = TfidfModel(word_count_vector_list, smartirs=smartirs)
    tfidf_vector_list = [*tfidf_model[word_count_vector_list]]
    # corpus2csc yields terms x docs; transpose to docs x terms.
    sparse_matrix = corpus2csc(tfidf_vector_list, num_terms=len(dictionary.token2id)).T
    rb = BrtcReprBuilder()
    # Per-term document frequency and IDF, sorted by descending count.
    dictionary_data = [[index, word, tfidf_model.dfs[index], tfidf_model.idfs[index]] for index, word in dictionary.items()]
    dictionary_table = pd.DataFrame(data=dictionary_data, columns=['index', 'word', 'count', 'idf'])
    dictionary_table = dictionary_table.sort_values(["count"], ascending=[False])
    rb.addMD(strip_margin(""" | ## TFIDF Result | ### Dictionary | {table1} """.format(table1=pandasDF2MD(dictionary_table))))
    out_table[output_col_name] = csr_matrix_to_sparse_vector_json_list(sparse_matrix)
    model = _model_dict('tfidf_model')
    model['dictionary_table'] = dictionary_table
    model['dictionary'] = dictionary
    model['tfidf_model'] = tfidf_model
    model['input_col'] = input_col
    model['output_col_name'] = output_col_name
    model['_repr_brtc_'] = rb.get()
    return {'out_table': out_table, 'model': model}
def _ada_boost_classification_train(table, feature_cols, label_col, max_depth=1, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None):
    """Train an AdaBoost classifier (decision-tree base learner of depth
    max_depth) on table[feature_cols] -> table[label_col].

    Returns {'model': model dict with the fitted classifier, parameters,
    feature-importance table, and a markdown report}.
    """
    x_train = table[feature_cols]
    y_train = table[label_col]
    base_estimator = DecisionTreeClassifier(max_depth=max_depth)
    # Fix: pass hyperparameters by keyword — on modern scikit-learn these
    # AdaBoostClassifier constructor arguments are keyword-only, so the
    # original positional call raises TypeError. The base estimator stays
    # positional because its parameter name changed across versions
    # (base_estimator -> estimator).
    classifier = AdaBoostClassifier(base_estimator,
                                    n_estimators=n_estimators,
                                    learning_rate=learning_rate,
                                    algorithm=algorithm,
                                    random_state=random_state)
    classifier.fit(x_train, y_train)
    params = {'feature_cols': feature_cols, 'label_col': label_col, 'feature_importance': classifier.feature_importances_, 'n_estimators': n_estimators, 'learning_rate': learning_rate, 'algorithm': algorithm, 'random_state': random_state}
    model = _model_dict('ada_boost_classification_model')
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier
    model['params'] = params
    fig_feature_importance = _plot_feature_importance(feature_cols, classifier)
    # Fix: the original rebound `params` to the markdown string, shadowing the
    # parameter dict stored above; use a distinct name for the rendered form.
    params_md = dict2MD(get_param)
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin(""" | ## AdaBoost Classification Train Result | | ### Feature Importance | {fig_feature_importance} | | ### Parameters | {list_parameters} | """.format(fig_feature_importance=fig_feature_importance, list_parameters=params_md)))
    model['_repr_brtc_'] = rb.get()
    feature_importance = classifier.feature_importances_
    feature_importance_table = pd.DataFrame([[feature_cols[i], feature_importance[i]] for i in range(len(feature_cols))], columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model': model}
def agglomerative_clustering_train_predict(input_table, input_cols, n_clusters=3, affinity='euclidean', compute_full_tree=True, linkage='ward', prediction_col='prediction', figw=6.4, figh=4.8):
    """Fit sklearn AgglomerativeClustering and render a dendrogram report.

    NOTE(review): writes prediction_col into *input_table* in place (no copy),
    unlike the other train/predict functions in this file — confirm intended.
    Returns {'out_table': mutated input_table,
             'agglomerative_result': model dict with the dendrogram repr}.
    """
    inputarr = input_table[input_cols]
    agglomerative_clustering = SKAgglomerativeClustering(n_clusters=n_clusters, affinity=affinity, memory=None, connectivity=None, compute_full_tree=compute_full_tree, linkage=linkage)
    agglomerative_clustering.fit(inputarr)
    input_table[prediction_col] = agglomerative_clustering.labels_
    # Build a scipy-style linkage matrix from sklearn's children_ so that
    # scipy's dendrogram can draw it: [child_a, child_b, distance, n_obs].
    # Distances are synthetic (merge order), not real cluster distances.
    children = agglomerative_clustering.children_
    distance = np.arange(children.shape[0])
    no_of_observations = np.arange(2, children.shape[0] + 2)
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)
    plt.figure(figsize=(figw, figh))
    dendrogram(linkage_matrix)
    plot_dendrogram = plt2MD(plt)
    plt.clf()
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin(""" | ## Agglomerative Clustering Result | {plot_dendrogram} """.format(plot_dendrogram=plot_dendrogram)))
    agglomerative_clustering_result = {
        'model': agglomerative_clustering,
        'input_cols': input_cols,
        '_repr_brtc_': rb.get()
    }
    return {
        'out_table': input_table,
        'agglomerative_result': agglomerative_clustering_result
    }
def _doc_term_mtx(table, model, input_col, result_type='doc_to_bow_token'):
    """Build a document-term view of tokenized documents using the dictionary
    stored in *model*.

    result_type: 'doc_to_bow_token' (per-doc "(token, count)" strings),
    'doc_term_mtx' (docs x terms dense matrix), or 'term_doc_mtx'
    (terms x docs dense matrix); anything else raises via raise_runtime_error.
    Returns {'out_table': the chosen view}.
    """
    corpus = table[input_col].tolist()
    dictionary = model['dictionary']
    bow_corpus = []
    for doc in corpus:
        bow_corpus.append(dictionary.doc2bow(doc))
    # Human-readable "(token, count)" pairs per document.
    doc_to_bow = []
    for i in range(len(corpus)):
        token_cnt = []
        for j in range(len(bow_corpus[i])):
            token_cnt.append('({token}, {cnt})'.format(token=dictionary[bow_corpus[i][j][0]], cnt=bow_corpus[i][j][1]))
        doc_to_bow.append(token_cnt)
    doc_to_bow_list = []
    for doc in doc_to_bow:
        doc_to_bow_list.append('{}'.format(list(doc)))
    doc_idx = ['doc_{}'.format(i) for i in range(len(corpus))]
    terms = [term for term in dictionary.token2id.keys()]
    if result_type == 'doc_to_bow_token':
        out_table = pd.DataFrame(data=doc_to_bow_list, columns=['doc_to_bow'])
        out_table.insert(loc=0, column='doc_idx', value=doc_idx)
    elif result_type == 'doc_term_mtx':
        # corpus2dense gives terms x docs; transpose for docs as rows.
        out_table = pd.DataFrame(matutils.corpus2dense(bow_corpus, num_terms=len(dictionary.token2id)).T)
        out_table.insert(loc=0, column=' ', value=doc_idx)
        out_table.columns = np.append('', terms)
    elif result_type == 'term_doc_mtx':
        out_table = pd.DataFrame(matutils.corpus2dense(bow_corpus, num_terms=len(dictionary.token2id)))
        out_table.insert(loc=0, column=' ', value=terms)
        out_table.columns = np.append('', doc_idx)
    else:
        raise_runtime_error("Please check 'result_type'.")
    # NOTE(review): `model` below is rebuilt but never returned, and the
    # repr builder stays empty — looks like leftover scaffolding; confirm.
    rb = BrtcReprBuilder()
    model = _model_dict('doc_term_mtx')
    model['bow_corpus'] = bow_corpus
    model['_repr_brtc_'] = rb.get()
    return {'out_table': out_table}
def _wilcoxon_test(table, response_col, factor_col, zero_method='wilcox', correction=False):
    """Pairwise Wilcoxon signed-rank tests between every pair of factor levels.

    NOTE(review): scipy's wilcoxon is a *paired* test and requires x and y of
    equal length — groups of unequal size will raise; confirm callers
    guarantee balanced groups.
    Returns {'result': {'<name1>_<name2>': {'Statistics', 'P value'}, ...,
                        '_repr_brtc_': markdown report}}.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Wilcoxon Test Result""")
    groups = dict()
    for name, group in table.groupby(factor_col):
        groups[name] = group
    # One test per unordered pair of factor levels.
    for name1, name2 in itertools.combinations(groups.keys(), 2):
        stats, pval = wilcoxon(x=groups[name1][response_col], y=groups[name2][response_col], zero_method=zero_method, correction=correction)
        rb.addMD(strip_margin(""" | ## {name1} vs {name2} | | ### The sum of the ranks of the differences: {stats} | | ### The two-sided p-value for the test: {pval} """.format(name1=name1, name2=name2, stats=stats, pval=pval)))
        name = str(name1) + '_' + str(name2)
        result[name] = dict()
        result[name]['Statistics'] = stats
        result[name]['P value'] = pval
    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _kruskal_wallis_test(table, response_cols, factor_col, nan_policy='propagate'):
    """Kruskal-Wallis H-test of each response column across factor_col groups.

    Note: nan_policy is accepted but not forwarded to scipy's kruskal here.
    Returns {'result': {'<response>_<factor>': {'Statistics', 'P value'}, ...,
                        '_repr_brtc_': markdown report}}.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Kruskal Wallis test Result""")
    groups = dict()
    for name, group in table.groupby(factor_col):
        groups[name] = group
    for response_col in response_cols:
        # One sample per factor level, compared jointly.
        stats, pval = kruskal(*[x[response_col] for x in groups.values()])
        rb.addMD(strip_margin(""" | ## {response_col} by {factor_col} | | ### Statistics value: {stats} | | ### P value: {pval} """.format(response_col=response_col, factor_col=factor_col, stats=stats, pval=pval)))
        name = response_col + '_' + factor_col
        result[name] = dict()
        result[name]['Statistics'] = stats
        result[name]['P value'] = pval
    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _mann_whitney_test(table, response_col, factor_col, use_continuity=True):
    """Pairwise Mann-Whitney U tests between all factor-level pairs
    (tabular-report variant).

    Returns {'result': {'<name1> vs <name2>': {'Statistics', 'P value'}, ...,
                        '_repr_brtc_': markdown table report}}.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Mann Whitney test Result""")
    groups = dict()
    uniq_factor = table[factor_col].unique()
    # Boolean-mask split rather than groupby; preserves unique() ordering.
    for name in uniq_factor:
        groups[name] = np.array(table[response_col])[np.where(table[factor_col] == name)]
    group_name = []
    stats = []
    pvals = []
    for name1, name2 in itertools.combinations(uniq_factor, 2):
        name = str(name1) + ' vs ' + str(name2)
        stat, pval = mannwhitneyu(groups[name1], groups[name2], use_continuity=use_continuity)
        group_name.append(name)
        stats.append(stat)
        pvals.append(pval)
        result[name] = dict()
        result[name]['Statistics'] = stat
        result[name]['P value'] = pval
    # Single summary table with one row per pair.
    rb.addMD(strip_margin(""" | {table} """.format(table=pandasDF2MD(pd.DataFrame({'': group_name, 'Test Statistics': stats, 'P Value': pvals})))))
    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _ljung_box_test(table, input_cols, lags=None):
    """Ljung-Box autocorrelation test for each column in input_cols.

    NOTE(review): assumes acorr_ljungbox returns an (lbvalue, pvalue) tuple —
    newer statsmodels returns a DataFrame instead; confirm the pinned version.
    Returns {'result': {col: per-lag DataFrame, ...,
                        '_repr_brtc_': markdown report}}.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Ljung Box test Result""")
    for input_col in input_cols:
        lbvalue, pvalue = acorr_ljungbox(x=table[input_col], lags=lags)
        # One row per lag, starting at lag 1.
        lb_res = dict()
        lb_res['lags'] = range(1, len(lbvalue) + 1)
        lb_res['test statistic'] = lbvalue
        lb_res['p-value based on chi-square distribution'] = pvalue
        lb_res = pd.DataFrame(lb_res)
        rb.addMD(strip_margin(""" | ## {input_col} test result | | {lb_res} """.format(input_col=input_col, lb_res=pandasDF2MD(lb_res, num_rows=lb_res.shape[0]))))
        result[input_col] = lb_res
    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _kruskal_wallis_test(table, response_cols, factor_col, nan_policy='propagate'):
    """Kruskal-Wallis H-test per response column (tabular-report variant).

    Note: nan_policy is accepted but not forwarded to scipy's kruskal here.
    Returns {'result': {'<response>_<factor>': {'Statistics', 'P value'}, ...,
                        '_repr_brtc_': markdown table report}}.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Kruskal Wallis test Result""")
    groups = dict()
    for name, group in table.groupby(factor_col):
        groups[name] = group
    group_name = []
    # Degrees of freedom = number of groups - 1, identical for every response.
    df = [len(groups) - 1] * len(response_cols)
    stats = []
    pvals = []
    for response_col in response_cols:
        stat, pval = kruskal(*[x[response_col] for x in groups.values()])
        group_name.append(response_col + ' by ' + factor_col)
        stats.append(stat)
        pvals.append(pval)
        name = response_col + '_' + factor_col
        result[name] = dict()
        result[name]['Statistics'] = stat
        result[name]['P value'] = pval
    rb.addMD(strip_margin(""" | {table} """.format(table=pandasDF2MD(pd.DataFrame({'': group_name, 'Degree of Freedom': df, 'Test Statistics': stats, 'P value': pvals})))))
    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _function_by_group2(function, table=None, model=None, columns=None, group_by=None, **params):
    """Apply *function* per group and merge the per-group outputs.

    Either *table* (grouped by group_by) or a grouped *model* (carrying
    '_grouped_data') supplies the groups. DataFrame outputs are flattened back
    into single tables; model outputs get a combined markdown repr with one
    section per group. Returns the merged result dict.
    """
    # A dict model must have been produced by a grouped run.
    if isinstance(model, dict) and '_grouped_data' not in model:
        raise Exception('Unsupported model. model requires _grouped_data.')
    if isinstance(model, dict):
        groups = model['_grouped_data']['groups']
        group_by = model['_grouped_data']['group_by']
    if isinstance(table, pd.DataFrame):
        # Use group keys from the table even when a model is present.
        table, groups = _group(table, params, group_by)
    # Run once on a sample to discover the shape of the outputs:
    # which keys hold DataFrames vs. model dicts with reprs.
    sample_result = _sample_result(function, table, model, params, groups)
    res_keys, df_keys, model_keys_containing_repr = _info_from_sample_result(sample_result, group_by, groups)
    res_dict, success_keys = _function_by_group_key(function, table, model, params, groups, res_keys, group_by)
    # Stitch each group's repr into one report, headed by its group key.
    for repr_key in model_keys_containing_repr:
        rb = BrtcReprBuilder()
        for group in success_keys:
            rb.addMD('--- \n\n ### Group by {group_by} : {tmp_group}\n\n---'.format(group_by=group_by, tmp_group=group))
            rb.merge(res_dict[repr_key]['_grouped_data']['data'][tuple(group)]['_repr_brtc_'])
        res_dict[repr_key]['_repr_brtc_'] = rb.get()
    # Concatenate per-group DataFrames back into flat tables.
    for df_key in df_keys:
        res_dict[df_key] = _flatten(res_dict[df_key], groups, group_by, columns)
    return res_dict
def _mann_whitney_test(table, response_col, factor_col, use_continuity=True):
    """Pairwise Mann-Whitney U tests between all factor-level pairs
    (per-pair markdown-section variant).

    Returns {'result': {'<name1>_<name2>': {'Statistics', 'P value'}, ...,
                        '_repr_brtc_': markdown report}}.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Mann Whitney test Result""")
    groups = dict()
    uniq_factor = table[factor_col].unique()
    # Boolean-mask split of the response by factor level.
    for name in uniq_factor:
        groups[name] = np.array(table[response_col])[np.where(table[factor_col] == name)]
    for name1, name2 in itertools.combinations(uniq_factor, 2):
        stats, pval = mannwhitneyu(groups[name1], groups[name2], use_continuity=use_continuity)
        rb.addMD(strip_margin(""" | ## {name1} vs {name2} | | ### Statistics U value: {stats} | | ### P value: {pval} """.format(name1=name1, name2=name2, stats=stats, pval=pval)))
        name = str(name1) + '_' + str(name2)
        result[name] = dict()
        result[name]['Statistics'] = stats
        result[name]['P value'] = pval
    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _wilcoxon_test2(table, first_col, second_col, zero_method='wilcox', correction=False):
    """Paired Wilcoxon signed-rank test between two columns of *table*.

    Returns {'result': {'_repr_brtc_': markdown report with the statistic
    and two-sided p-value}}.
    """
    statistic, p_value = wilcoxon(x=table[first_col],
                                  y=table[second_col],
                                  zero_method=zero_method,
                                  correction=correction)
    # Single-row summary table (two-sided alternative only).
    result_table = pd.DataFrame({
        'Alternative hypothesis': ['Median of the differences != 0'],
        'Sum of differences ranks': [statistic],
        'P-value': [p_value]
    })
    rb = BrtcReprBuilder()
    rb.addMD("""## Wilcoxon Test Result""")
    rb.addMD(strip_margin(""" | {table} """.format(table=pandasDF2MD(result_table))))
    return {'result': {'_repr_brtc_': rb.get()}}
def _outlier_detection_lof(table, input_cols, n_neighbors=20, result_type='add_prediction', new_column_name='is_outlier'):
    """Flag outliers with sklearn's Local Outlier Factor (novelty mode).

    result_type: 'add_prediction' keeps all rows with an 'in'/'out' column,
    'remove_outliers' drops outliers and the flag column, 'both' drops
    outliers but keeps the column; anything else raises.
    Returns {'out_table': filtered/flagged copy, 'model': lof model dict}.
    """
    out_table = table.copy()
    features = out_table[input_cols]
    lof_model = LocalOutlierFactor(n_neighbors, algorithm='auto', leaf_size=30, metric='minkowski', p=2, novelty=True, contamination=0.1)
    lof_model.fit(features)
    # LOF predict returns 1 for inliers, -1 for outliers.
    isinlier = lambda _: 'in' if _ == 1 else 'out'
    out_table[new_column_name] = [isinlier(lof_predict) for lof_predict in lof_model.predict(features)]
    if result_type == 'add_prediction':
        pass
    elif result_type == 'remove_outliers':
        out_table = out_table[out_table[new_column_name] == 'in']
        out_table = out_table.drop(new_column_name, axis=1)
    elif result_type == 'both':
        out_table = out_table[out_table[new_column_name] == 'in']
    else:
        raise_runtime_error("Please check 'result_type'.")
    params = {
        'Input Columns': input_cols,
        'Result Type': result_type,
        'Number of Neighbors': n_neighbors,
    }
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin(""" | ## Outlier Detection (Local Outlier Factor) Result | ### Parameters | | {display_params} | """.format(display_params=dict2MD(params))))
    model = _model_dict('outlier_detection_lof')
    model['params'] = params
    model['lof_model'] = lof_model
    model['input_cols'] = input_cols
    model['result_type'] = result_type
    model['num_neighbors'] = n_neighbors
    model['_repr_brtc_'] = rb.get()
    return {'out_table': out_table, 'model': model}
def _pls_regression_train(table, feature_cols, label_cols, n_components=2, scale=True, max_iter=500, tol=1e-6):
    """Train a partial-least-squares regression on table[feature_cols] ->
    table[label_cols] and report training-set fit metrics.

    Returns {'model': model dict with the fitted PLS model, metrics, and a
    markdown report}.
    """
    pls_model = PLS(n_components=n_components, scale=scale, max_iter=max_iter, tol=tol)
    _, features = check_col_type(table, feature_cols)
    _, labels = check_col_type(table, label_cols)
    pls_model.fit(features, labels)
    # Metrics are computed on the training data (no holdout here).
    predict = pls_model.predict(features)
    _mean_absolute_error = mean_absolute_error(labels, predict)
    _mean_squared_error = mean_squared_error(labels, predict)
    _r2_score = r2_score(labels, predict)
    # pd.DataFrame.from_items was deprecated in pandas 0.23 and removed in
    # 1.0; build the frames with an explicit column order instead.
    result_table = pd.DataFrame({'Metric': ['Mean Absolute Error', 'Mean Squared Error', 'R2 Score'],
                                 'Score': [_mean_absolute_error, _mean_squared_error, _r2_score]},
                                columns=['Metric', 'Score'])
    # Display labels for the subset of sklearn parameters worth reporting.
    label_name = {
        'n_components': 'Number of components',
        'scale': "Scale",
        'max_iter': 'Max iteration',
        'tol': 'Tolerance'
    }
    get_param = pls_model.get_params()
    param_table = pd.DataFrame({'Parameter': list(label_name.values()),
                                'Value': [get_param[x] for x in list(label_name.keys())]},
                               columns=['Parameter', 'Value'])
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin(""" | ### PLS Regression Result | {result} | ### Parameters | {list_parameters} """.format(result=pandasDF2MD(result_table), list_parameters=pandasDF2MD(param_table))))
    model = _model_dict('pls_regression_model')
    model['feature_cols'] = feature_cols
    model['label'] = label_cols
    model['mean_absolute_error'] = _mean_absolute_error
    model['mean_squared_error'] = _mean_squared_error
    model['r2_score'] = _r2_score
    model['max_iter'] = max_iter
    model['tol'] = tol
    model['pls_model'] = pls_model
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _autocorrelation(table, input_col, nlags=20, conf_level=0.95):
    """ACF/PACF plots and tables for table[input_col] up to nlags lags.

    Returns {'model': dict with both tables and a markdown report}.
    Note: conf_level is required in practice — acf/pacf are called with
    alpha=1-conf_level before the `is not None` checks below.
    """
    data = table[input_col]
    plt.figure()
    plot_acf(data, lags=nlags, alpha=1 - conf_level)
    fig_plt_acf = plt2MD(plt)
    plt.clf()
    plt.figure()
    plot_pacf(data, lags=nlags, alpha=1 - conf_level)
    fig_plt_pacf = plt2MD(plt)
    plt.clf()
    # With alpha set, acf/pacf return (values, confidence_intervals).
    acf_ret = acf(data, nlags=nlags, alpha=1-conf_level)
    pacf_ret = pacf(data, nlags=nlags, alpha=1-conf_level)
    result_table1 = pd.DataFrame([])
    result_table1['lag'] = list(range(nlags + 1))
    result_table1['ACF'] = acf_ret[0]
    if conf_level is not None:
        result_table1['%g%% confidence Interval' % (conf_level * 100)] = [str((acf_ret[1][i][0], acf_ret[1][i][1])) for i in range(nlags + 1)]
    result_table2 = pd.DataFrame([])
    result_table2['lag'] = list(range(nlags + 1))
    result_table2['PACF'] = pacf_ret[0]
    if conf_level is not None:
        result_table2['%g%% confidence Interval' % (conf_level * 100)] = [str((pacf_ret[1][i][0], pacf_ret[1][i][1])) for i in range(nlags + 1)]
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Autocorrelation / Partial Autocorrelation Result"""))
    rb.addMD(strip_margin(""" |## Autocorrelation | |{image1} | |### Autocorrelation Table | |{result_table1} | |## Partial Autocorrelation | |{image2} | |### Partial Autocorrelation Table | |{result_table2} | """.format(image1=fig_plt_acf, result_table1=pandasDF2MD(result_table1, num_rows=nlags + 1), image2=fig_plt_pacf, result_table2=pandasDF2MD(result_table2, num_rows=nlags + 1))))
    model = _model_dict('autocorrelation')
    model['autocorrelation_table'] = result_table1
    model['partial_autocorrelation_table'] = result_table2
    model['_repr_brtc_'] = rb.get()
    return {'model':model}
def _hierarchical_clustering_post(table, model, num_clusters, cluster_col='prediction'):
    """Cut a fitted hierarchical-clustering linkage into num_clusters flat
    clusters and summarize cluster membership.

    *model* must carry 'model' (scipy linkage Z), 'input_mode'
    ('original' or 'matrix'), 'linkage_matrix', and — for matrix mode —
    'dist_matrix'. NOTE(review): an input_mode other than these two leaves
    prediction_table unbound (NameError); confirm upstream guarantees.
    Returns {'out_table': table with cluster_col, 'model': summary dict}.
    """
    Z = model['model']
    mode = model['input_mode']
    if mode == 'matrix':
        distance_matrix = model['dist_matrix']
    out_table = model['linkage_matrix']
    # maxclust: choose the cut height that yields at most num_clusters clusters.
    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    if mode == 'original':
        prediction_table = table.copy()
    elif mode == 'matrix':
        prediction_table = distance_matrix
    prediction_table[cluster_col] = predict
    # leaders: root node (L) of each flat cluster and its cluster id (M).
    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        # Map each leader node back to the merge row that created it, and pick
        # up the human-readable joined-column name recorded at that merge.
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])
    clusters_info_table = pd.DataFrame([])
    clusters_info_table[cluster_col] = M
    clusters_info_table['name_of_clusters'] = which_cluster
    clusters_info_table = clusters_info_table.sort_values(cluster_col)
    # Per-cluster sizes; bincount pads with zeros for unused ids, drop them.
    cluster_count = np.bincount(prediction_table[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    clusters_info_table['num_of_entities'] = list(cluster_count)
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""### Hierarchical Clustering Post Process Result"""))
    rb.addMD(strip_margin(""" |### Parameters | |{display_params} | |## Clusters Information | |{clusters_info_table} | """.format(display_params=dict2MD(model['parameters']), clusters_info_table=pandasDF2MD(clusters_info_table))))
    model = _model_dict('hierarchical_clustering_post')
    model['clusters_info'] = clusters_info_table
    model['_repr_brtc_'] = rb.get()
    return {'out_table': prediction_table, 'model': model}
def _plot_roc_pr_curve(table, label_col, probability_col, fig_w=6.4, fig_h=4.8, pos_label=None):
    """Render ROC curve, PR curve, and confusion matrix for binary scores.

    Returns {'result': summary dict with the chosen threshold, the column
    names, and a markdown report of all figures}.
    """
    # _plot_binary produces the threshold plus five rendered figures.
    threshold, fig_tpr_fpr, fig_roc, fig_precision_recall, fig_pr, fig_confusion = \
        _plot_binary(table[label_col], table[probability_col],
                     fig_size=(fig_w, fig_h), pos_label=pos_label)
    summary = {
        'threshold': threshold,
        'label_col': label_col,
        'probability_col': probability_col
    }
    report = BrtcReprBuilder()
    report.addMD(strip_margin(""" | ## Plot ROC Curve and PR Curve Result | | ### ROC Curve | {fig_tpr_fpr} | {fig_roc} | | ### PR Curve | {fig_precision_recall} | {fig_pr} | | ### Confusion Matrix | {fig_confusion} """.format(fig_roc=fig_roc, fig_tpr_fpr=fig_tpr_fpr, fig_pr=fig_pr, fig_precision_recall=fig_precision_recall, fig_confusion=fig_confusion)))
    summary['_repr_brtc_'] = report.get()
    return {'result': summary}
def _pairplot(table, x_vars, y_vars=None, kind='scatter', diag_kind='auto', markers=None, palette=None, height=2.5, aspect=1, dropna=True, hue=None):
    """Seaborn pairplot of *table* wrapped as a report payload.

    Returns {'result': {'_repr_brtc_': rendered plot}}.
    """
    validate(greater_than(height, 0, 'height'), greater_than(aspect, 0, 'aspect'))
    # Scale marker size with subplot height (relative to the 6.4in default).
    s_default = plt.rcParams['lines.markersize']**2.
    plot_kws = {"s": s_default * height / 6.4}
    # Square grid by default.
    if y_vars is None:
        y_vars = x_vars
    if kind == 'scatter':
        g = sns.pairplot(table, x_vars=x_vars, y_vars=y_vars, kind=kind, diag_kind=diag_kind, markers=markers, height=height, aspect=aspect, \
                         dropna=dropna, hue=hue, palette=palette, plot_kws=plot_kws)
    else:
        # Regression-style kinds take marker options nested under scatter_kws.
        scatter_kws = {'scatter_kws': plot_kws}
        g = sns.pairplot(table, x_vars=x_vars, y_vars=y_vars, kind=kind, diag_kind=diag_kind, markers=markers, height=height, aspect=aspect, \
                         dropna=dropna, hue=hue, palette=palette, plot_kws=scatter_kws)
    # Rotate x tick labels progressively as subplots shrink to avoid overlap.
    if height <= 2.5:
        for ax in g.axes.flatten():
            for label in ax.get_xticklabels():
                label.set_rotation(90 * (2.5 - height))
    rb = BrtcReprBuilder()
    rb.addPlt(plt)
    plt.clf()
    return {'result': {'_repr_brtc_': rb.get()}}
def _term_term_mtx(table, model, input_col, result_type='sparse'):
    """Term-term co-occurrence counts over tokenized documents, using the
    dictionary stored in *model*.

    result_type: 'sparse' (upper-triangular pair list) or 'dense'
    (full terms x terms matrix, excluding user-added words); anything else
    raises via raise_runtime_error. Returns {'out_table': the chosen view}.
    """
    corpus = table[input_col].tolist()
    dictionary = model['dictionary']
    bow_corpus = []
    for doc in corpus:
        bow_corpus.append(dictionary.doc2bow(doc))
    # Binarize counts so each document contributes at most 1 per term;
    # X^T @ X then counts documents containing both terms.
    csr_matrix = matutils.corpus2csc(bow_corpus).T
    csr_matrix.data = np.array([1 for _ in range(len(csr_matrix.data))])
    term_term = (csr_matrix.T @ csr_matrix).tocoo()
    if result_type == 'sparse':
        # Keep the strict upper triangle: each unordered pair once, no diagonal.
        term_term = sparse.triu(term_term, k=1)
        out_table = pd.DataFrame([dictionary[i] for i in term_term.row], columns=['term1'])
        out_table['term2'] = [dictionary[j] for j in term_term.col]
        out_table['number_of_documents_containing_terms'] = term_term.data
    elif result_type == 'dense':
        if model['add_words'] is None:
            model['add_words'] = []
        # Exclude words appended to the dictionary after the fact.
        num_origin = len(dictionary) - len(model['add_words'])
        terms = [term for term in dictionary.token2id.keys()][:num_origin]
        doc_idx = ['doc_{}'.format(i) for i in range(len(corpus))]
        out_table = pd.DataFrame(term_term.todense())
        out_table.insert(loc=0, column=' ', value=terms)
        out_table.columns = np.append(" ", terms)
    else:
        raise_runtime_error("Please check 'result_type'.")
    # NOTE(review): `model` below is rebuilt but never returned, and the repr
    # builder stays empty — looks like leftover scaffolding; confirm.
    rb = BrtcReprBuilder()
    model = _model_dict('term_term_mtx')
    model['term_term_mtx'] = term_term
    model['_repr_brtc_'] = rb.get()
    return {'out_table': out_table}