def _xgb_regression_train(table, feature_cols, label_col, max_depth=3, learning_rate=0.1, n_estimators=100,
                          silent=True, objective='reg:linear', booster='gbtree', n_jobs=1, nthread=None,
                          gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
                          colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5,
                          random_state=None, seed=None, missing=None, sample_weight=None, eval_set=None,
                          eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None,
                          sample_weight_eval_set=None, importance_type='gain'):
    if random_state is None:
        random_state = randint(-2 ** 31, 2 ** 31 - 1)
    # 'objectibe' was a typo for 'objective'
    regressor = XGBRegressor(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators,
                             silent=silent, objective=objective, booster=booster, n_jobs=n_jobs,
                             nthread=nthread, gamma=gamma, min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step, subsample=subsample,
                             colsample_bytree=colsample_bytree, colsample_bylevel=colsample_bylevel,
                             reg_alpha=reg_alpha, reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight,
                             base_score=base_score, random_state=random_state, seed=seed, missing=missing,
                             importance_type=importance_type)
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]
    regressor.fit(features, label, sample_weight, eval_set, eval_metric,
                  early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set)

    # json
    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_
    # plt.rcdefaults()
    plot_importance(regressor)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(regressor)
    # fig_plot_tree_UT = plt2MD(plt)
    # plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(regressor, rankdir='LR')
    # fig_plot_tree_LR = plt2MD(plt)
    # plt.rcdefaults()
    # plt.clf()

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance
    # out_model['plot_tree_UT'] = fig_plot_tree_UT
    # out_model['plot_tree_LR'] = fig_plot_tree_LR
    # out_model['to_graphviz'] = md_to_graphviz

    # report
    get_param_list = [['feature_cols', feature_names], ['label_col', label_col]]
    for key, value in get_param.items():
        get_param_list.append([key, value])
    get_param_df = pd.DataFrame(data=get_param_list, columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_names).T

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## XGB Regression Result
    |
    | ### Plot Feature Importance
    | {image_importance}
    |
    | ### Normalized Feature Importance Table
    | {table_feature_importance}
    |
    | ### Parameters
    | {table_parameter}
    |
    """.format(image_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               table_parameter=pandasDF2MD(get_param_df))))
    out_model['_repr_brtc_'] = rb.get()

    feature_importance_table = pd.DataFrame(
        [[feature_cols[i], feature_importance[i]] for i in range(len(feature_cols))],
        columns=['feature_name', 'importance'])
    out_model['feature_importance_table'] = feature_importance_table
    return {'model': out_model}
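# --- Usage sketch (illustrative, not part of the original module) --------------
# A minimal call to _xgb_regression_train; the frame and column names below are
# hypothetical, and the report helpers (plt2MD, BrtcReprBuilder, ...) come from
# the surrounding module.
#
#   df = pd.DataFrame({'x1': [1, 2, 3, 4, 5], 'x2': [0.5, 0.1, 0.9, 0.3, 0.7],
#                      'y': [1.2, 2.1, 3.3, 3.9, 5.1]})
#   res = _xgb_regression_train(df, feature_cols=['x1', 'x2'], label_col='y',
#                               n_estimators=50, random_state=42)
#   res['model']['feature_importance_table']   # per-feature importance DataFrame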
def _hierarchical_clustering(table, input_cols, input_mode, key_col=None, link='complete', met='euclidean',
                             num_rows=20, figure_height=6.4, orient='right'):
    out_table = table.copy()
    features = out_table[input_cols]

    if input_mode == 'original':
        len_features = len(features)
        if key_col is not None:
            data_names = list(out_table[key_col])
        else:
            data_names = ['pt_' + str(i) for i in range(len_features)]
        Z = linkage(features, method=link, metric=met)
    elif input_mode == 'matrix':
        len_features = len(input_cols)
        if key_col is not None:
            data_names = [out_table[key_col][out_table.columns.get_loc(column)] for column in input_cols]
        else:
            data_names = [out_table.columns[out_table.columns.get_loc(column)] for column in input_cols]
        col_index = [out_table.columns.get_loc(column) for column in input_cols]
        dist_matrix = features.iloc[col_index]
        Z = linkage(dist_matrix, method=link, metric=met)
        dist_matrix['label'] = data_names
    else:
        raise_runtime_error("Please check 'input_mode'.")

    range_len_Z = range(len(Z))
    linkage_matrix = pd.DataFrame([])
    linkage_matrix['linkage_step'] = [x + 1 for x in reversed(range_len_Z)]
    linkage_matrix['name_of_clusters'] = ['CL_' + str(i + 1) for i in reversed(range_len_Z)]

    joined_column1 = []
    for i in range_len_Z:
        if Z[:, 0][i] < len_features:
            joined_column1.append(data_names[int(Z[:, 0][i])])
        else:
            joined_column1.append(linkage_matrix['name_of_clusters'][Z[:, 0][i] - len_features])
    linkage_matrix['joined_column1'] = joined_column1

    joined_column2 = []
    for i in range_len_Z:
        if Z[:, 1][i] < len_features:
            joined_column2.append(data_names[int(Z[:, 1][i])])
        else:
            joined_column2.append(linkage_matrix['name_of_clusters'][Z[:, 1][i] - len_features])
    linkage_matrix['joined_column2'] = joined_column2

    linkage_matrix['distance'] = [distance for distance in Z[:, 2]]
    linkage_matrix['number_of_original'] = [int(entities) for entities in Z[:, 3]]
    linkage_matrix = linkage_matrix.reindex(index=linkage_matrix.index[::-1])[0:]

    # calculate full dendrogram
    def _llf(idx):
        if idx < len_features:
            return 'pt_' + str(idx)

    plt.figure(figsize=(8.4, figure_height))
    _fancy_dendrogram(
        Z,
        truncate_mode='none',  # show only the last p merged clusters (if another)
        get_leaves=True,
        orientation=orient,
        labels=data_names,
        # leaf_label_func=_llf,
        leaf_rotation=45,
        leaf_font_size=5.,
        show_contracted=False,  # to get a distribution impression in truncated branches
        annotate_above=float(10),  # useful in small plots so annotations don't overlap
    )
    plt.title('Hierarchical Clustering Dendrogram')
    if orient == 'top':
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient == 'right':
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    plt2 = plt2MD(plt)
    plt.clf()

    params = {
        'Input Columns': input_cols,
        'Linkage Method': link,
        'Metric': met,
        'Number of Rows in Linkage Matrix': num_rows
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""### Hierarchical Clustering Result"""))
    rb.addMD(strip_margin("""
    |## Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    | {display_params}
    |
    |## Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=plt2, display_params=dict2MD(params),
               out_table1=pandasDF2MD(linkage_matrix.head(num_rows)))))

    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_mode'] = input_mode
    if input_mode == 'original':
        model['table'] = out_table  # read back by _hierarchical_clustering_post
    elif input_mode == 'matrix':
        model['dist_matrix'] = dist_matrix
    model['parameters'] = params
    model['linkage_matrix'] = linkage_matrix
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
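# --- Usage sketch (illustrative, not part of the original module) --------------
# Clustering raw observations ('original' mode); the data below is hypothetical.
#
#   df = pd.DataFrame({'a': [1.0, 1.1, 5.0, 5.2], 'b': [0.9, 1.2, 4.8, 5.1]})
#   res = _hierarchical_clustering(df, input_cols=['a', 'b'],
#                                  input_mode='original', link='average')
#   res['model']['linkage_matrix']   # one merge step per row, with distances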
def _als_train(table, user_col, item_col, rating_col, mode='train', number=10, implicit=False,
               iterations=10, reg_param=0.1, rank=10, alpha=1.0, seed=None, targets=None):
    table_user_col = table[user_col]
    table_item_col = table[item_col]
    rating_col = table[rating_col]
    user_encoder = preprocessing.LabelEncoder()
    item_encoder = preprocessing.LabelEncoder()
    user_encoder.fit(table_user_col)
    item_encoder.fit(table_item_col)
    user_correspond = user_encoder.transform(table_user_col)
    item_correspond = item_encoder.transform(table_item_col)
    item_users = np.zeros((len(item_encoder.classes_), len(user_encoder.classes_)))
    for i in range(len(table_user_col)):
        if implicit:
            item_users[item_correspond[i]][user_correspond[i]] = rating_col[i]
        else:
            # a zero rating would vanish from the sparse matrix, so store it as -1
            if rating_col[i] == 0:
                item_users[item_correspond[i]][user_correspond[i]] = -1
            else:
                item_users[item_correspond[i]][user_correspond[i]] = rating_col[i]
    item_users = csr_matrix(item_users)
    als_model = AlternatingLeastSquares(factors=rank, implicit=implicit, iterations=iterations,
                                        regularization=reg_param, alpha=alpha, seed=seed)
    als_model.fit(item_users)

    tmp_col = [list(factor) for factor in als_model.user_factors]
    user_factors = pd.DataFrame(user_encoder.classes_, columns=[user_col])
    user_factors['features'] = tmp_col
    tmp_col = [list(factor) for factor in als_model.item_factors]
    item_factors = pd.DataFrame(item_encoder.classes_, columns=[item_col])
    item_factors['features'] = tmp_col

    if mode == 'Topn':
        if targets is None:
            targets = user_encoder.classes_
        targets_en = user_encoder.transform(targets)
        user_items = item_users.T.tocsr()
        Topn_result = []
        for user in targets_en:
            recommendations_corre = als_model.recommend(user, user_items, number)
            recommendations = []
            for (item, rating) in recommendations_corre:
                recommendations += [item_encoder.inverse_transform([item])[0], rating]
            Topn_result += [recommendations]
        Topn_result = pd.DataFrame(Topn_result)
        Topn_result = pd.concat([pd.DataFrame(targets), Topn_result], axis=1, ignore_index=True)
        column_names = ['user']
        for i in range(number):
            column_names += ['item_top%d' % (i + 1), 'rating_top%d' % (i + 1)]
        Topn_result.columns = column_names
        return {'out_table': Topn_result}

    parameters = dict()
    parameters['Iterations'] = iterations
    parameters['Reg Param'] = reg_param
    parameters['Seed'] = seed
    parameters['Rank'] = rank
    if implicit:
        parameters['alpha'] = alpha

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## ALS Train Result
    |
    | ### Parameters
    | {parameters}
    | ### Item Factors
    | {item_factors}
    | ### User Factors
    | {user_factors}
    |
    """.format(item_factors=pandasDF2MD(item_factors, num_rows=item_users.shape[0]),
               user_factors=pandasDF2MD(user_factors, num_rows=item_users.shape[1]),
               parameters=dict2MD(parameters))))

    model = _model_dict('ALS')
    model['als_model'] = als_model
    model['item_encoder'] = item_encoder
    model['user_encoder'] = user_encoder
    model['user_col'] = user_col
    model['item_col'] = item_col
    model['user_factors'] = user_factors
    model['item_factors'] = item_factors
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _lda(table, input_col, num_voca=1000, num_topic=3, num_topic_word=3, max_iter=20,
         learning_method='online', learning_offset=10., random_state=None):
    corpus = table[input_col]
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_voca, stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=max_iter,
                                              learning_method=learning_method,
                                              learning_offset=learning_offset,
                                              random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=max_iter,
                                              learning_method=learning_method,
                                              random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")

    topic_model = pd.DataFrame([])
    topic_idx_list = []
    voca_weights_list = []
    for topic_idx, weights in enumerate(lda_model.components_):
        topic_idx_list.append("Topic {}".format(topic_idx))
        pairs = []
        for term_idx, value in enumerate(weights):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        voca_weights = []
        for pair in pairs[:num_topic_word]:
            voca_weights.append("{}: {}".format(pair[1], pair[0]))
        voca_weights_list.append(voca_weights)
    topic_model['topic idx'] = topic_idx_list
    topic_model['topic vocabularies'] = voca_weights_list

    doc_topic = lda_model.transform(term_count)
    doc_classification = pd.DataFrame()
    doc_classification['documents'] = [doc for doc in corpus]
    doc_classification['top topic'] = ["Topic {}".format(doc_topic[i].argmax()) for i in range(len(corpus))]

    params = {
        'Input Column': input_col,
        'Number of Vocabularies': num_voca,
        'Number of Topics': num_topic,
        'Number of Terminologies': num_topic_word,
        'Iterations': max_iter,
        'Learning Method': learning_method,
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Latent Dirichlet Allocation Result"""))
    rb.addMD(strip_margin("""
    |
    |### Parameters
    |
    | {display_params}
    |
    |### Topic Model
    |
    |{topic_model}
    |
    |### Documents Classification
    |
    |{doc_classification}
    |
    """.format(display_params=dict2MD(params),
               topic_model=pandasDF2MD(topic_model, num_rows=num_topic + 1),
               doc_classification=pandasDF2MD(doc_classification, num_rows=len(corpus) + 1))))

    model = _model_dict('lda')
    model['parameter'] = params
    model['topic_model'] = topic_model
    model['documents_classification'] = doc_classification
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
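# --- Usage sketch (illustrative, not part of the original module) --------------
# Topic modeling over a text column; 'docs' is hypothetical, and the corpus must
# be large enough that terms survive the CountVectorizer's min_df=2 cutoff.
#
#   res = _lda(docs, input_col='text', num_topic=2, num_topic_word=5,
#              learning_method='batch', random_state=0)
#   res['model']['topic_model']               # top weighted terms per topic
#   res['model']['documents_classification']  # most likely topic per document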
def _evaluate_classification(table, label_col, prediction_col):
    label = table[label_col]
    predict = table[prediction_col]

    # compute metrics
    accuracy = accuracy_score(label, predict)
    f1 = f1_score(label, predict, average="weighted")
    precision = precision_score(label, predict, average="weighted")
    recall = recall_score(label, predict, average="weighted")
    class_names = np.unique(np.union1d(label.values, predict.values))

    # Plot non-normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label, predict, classes=class_names,
                           title='Confusion matrix, without normalization')
    fig_cnf_matrix = plt2MD(plt)

    # Plot normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label, predict, classes=class_names, normalize=True,
                           title='Normalized confusion matrix')
    fig_cnf_matrix_normalized = plt2MD(plt)
    plt.clf()

    # json
    summary = dict()
    summary['label_col'] = label_col
    summary['prediction_col'] = prediction_col
    summary['f1_score'] = f1
    summary['accuracy_score'] = accuracy
    summary['precision_score'] = precision
    summary['recall_score'] = recall

    # report
    all_dict_list = [{'f1': f1, 'accuracy': accuracy, 'precision': precision, 'recall': recall}]
    all_df = pd.DataFrame(all_dict_list)
    all_df = all_df[['f1', 'accuracy', 'precision', 'recall']]
    summary['metrics'] = all_df

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Evaluate Classification Result
    | ### Metrics
    | {table1}
    |
    | ### Confusion matrix
    | {fig_confusion_matrix}
    |
    | {fig_confusion_matrix_normalized}
    |
    """.format(table1=pandasDF2MD(all_df), fig_confusion_matrix=fig_cnf_matrix,
               fig_confusion_matrix_normalized=fig_cnf_matrix_normalized)))
    summary['_repr_brtc_'] = rb.get()
    return {'result': summary}
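# --- Usage sketch (illustrative, not part of the original module) --------------
# Comparing a label column against a prediction column; names are hypothetical.
#
#   scored = pd.DataFrame({'actual': ['a', 'b', 'a', 'b'],
#                          'pred':   ['a', 'b', 'b', 'b']})
#   res = _evaluate_classification(scored, label_col='actual', prediction_col='pred')
#   res['result']['accuracy_score']   # plus f1/precision/recall and both matrices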
def _xgb_classification_train(table, feature_cols, label_col, max_depth=3, learning_rate=0.1,
                              n_estimators=100, silent=True, objective='binary:logistic', booster='gbtree',
                              n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0,
                              subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0,
                              reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0, seed=None,
                              missing=None, importance_type='gain', class_weight=None, eval_set=None,
                              eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None,
                              sample_weight_eval_set=None):
    y_train = table[label_col]
    class_labels = sorted(set(y_train))
    if class_weight is None:
        sample_weight = None
    else:
        if len(class_weight) != len(class_labels):
            raise ValueError("Number of class weights should match number of labels.")
        class_weight = {class_labels[i]: class_weight[i] for i in range(len(class_labels))}
        sample_weight = np.vectorize(_make_sample_weight)(y_train, class_weight)

    classifier = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators,
                               silent=silent, objective=objective, booster=booster, n_jobs=n_jobs,
                               nthread=nthread, gamma=gamma, min_child_weight=min_child_weight,
                               max_delta_step=max_delta_step, subsample=subsample,
                               colsample_bytree=colsample_bytree, colsample_bylevel=colsample_bylevel,
                               reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                               scale_pos_weight=scale_pos_weight, base_score=base_score,
                               random_state=random_state, seed=seed, missing=missing,
                               importance_type=importance_type)
    classifier.fit(table[feature_cols], table[label_col], sample_weight, eval_set, eval_metric,
                   early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set)

    # json
    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_
    # plt.rcdefaults()
    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(classifier)
    # fig_plot_tree_UT = plt2MD(plt)
    # plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(classifier, rankdir='LR')
    # fig_plot_tree_LR = plt2MD(plt)
    # plt.rcdefaults()
    # plt.clf()

    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier

    # report
    # get_param_list = []
    # get_param_list.append(['feature_cols', feature_cols])
    # get_param_list.append(['label_col', label_col])
    params = dict2MD(get_param)
    # for key, value in get_param.items():
    #     temp = [key, value]
    #     get_param_list.append(temp)
    # get_param_df = pd.DataFrame(data=get_param_list, columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_cols).T

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Feature Importance
    | {fig_importance}
    |
    | ### Normalized Feature Importance Table
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    feature_importance_table = pd.DataFrame(
        [[feature_cols[i], feature_importance[i]] for i in range(len(feature_cols))],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model': model}
def _autocorrelation(table, input_col, nlags=20, conf_level=0.95):
    data = table[input_col]

    plt.figure()
    plot_acf(data, lags=nlags, alpha=1 - conf_level)
    fig_plt_acf = plt2MD(plt)
    plt.clf()

    plt.figure()
    plot_pacf(data, lags=nlags, alpha=1 - conf_level)
    fig_plt_pacf = plt2MD(plt)
    plt.clf()

    acf_ret = acf(data, nlags=nlags, alpha=1 - conf_level)
    pacf_ret = pacf(data, nlags=nlags, alpha=1 - conf_level)

    result_table1 = pd.DataFrame([])
    result_table1['lag'] = list(range(nlags + 1))
    result_table1['ACF'] = acf_ret[0]
    if conf_level is not None:
        result_table1['%g%% confidence Interval' % (conf_level * 100)] = [
            str((acf_ret[1][i][0], acf_ret[1][i][1])) for i in range(nlags + 1)
        ]

    result_table2 = pd.DataFrame([])
    result_table2['lag'] = list(range(nlags + 1))
    result_table2['PACF'] = pacf_ret[0]
    if conf_level is not None:
        result_table2['%g%% confidence Interval' % (conf_level * 100)] = [
            str((pacf_ret[1][i][0], pacf_ret[1][i][1])) for i in range(nlags + 1)
        ]

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Autocorrelation / Partial Autocorrelation Result"""))
    rb.addMD(strip_margin("""
    |## Autocorrelation
    |
    |{image1}
    |
    |### Autocorrelation Table
    |
    |{result_table1}
    |
    |## Partial Autocorrelation
    |
    |{image2}
    |
    |### Partial Autocorrelation Table
    |
    |{result_table2}
    |
    """.format(image1=fig_plt_acf, result_table1=pandasDF2MD(result_table1, num_rows=nlags + 1),
               image2=fig_plt_pacf, result_table2=pandasDF2MD(result_table2, num_rows=nlags + 1))))

    model = _model_dict('autocorrelation')
    model['autocorrelation_table'] = result_table1
    model['partial_autocorrelation_table'] = result_table2
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
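# --- Usage sketch (illustrative, not part of the original module) --------------
# ACF/PACF of a single series; the sine-wave frame below is hypothetical.
#
#   ts = pd.DataFrame({'y': np.sin(np.linspace(0, 20, 200))})
#   res = _autocorrelation(ts, input_col='y', nlags=10, conf_level=0.95)
#   res['model']['autocorrelation_table']   # lag, ACF, and confidence interval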
def _gaussian_mixture_train(table, input_cols, number_of_components=1, covariance_type='full',
                            tolerance=0.001, regularize_covariance=1e-06, max_iteration=100,
                            initial_params='kmeans', seed=None):
    gmm = GaussianMixture(n_components=number_of_components, covariance_type=covariance_type,
                          tol=tolerance, reg_covar=regularize_covariance, max_iter=max_iteration,
                          init_params=initial_params, random_state=seed)
    feature_names, X_train = check_col_type(table, input_cols)
    gmm.fit(X_train)

    out_table = pd.DataFrame()
    comp_num_arr = list(range(number_of_components))
    mean_arr = [gmm.means_[i].tolist() for i in range(number_of_components)]
    covar_arr = [gmm.covariances_[i].tolist() for i in range(number_of_components)]
    out_table['component_number'] = comp_num_arr
    out_table['weight'] = gmm.weights_
    out_table['mean_coordinate'] = mean_arr
    out_table['covariance_matrix'] = covar_arr

    rb = BrtcReprBuilder()
    params = {
        'Input Columns': feature_names,
        'Number of Components': number_of_components,
        'Covariance Type': covariance_type,
        'Tolerance': tolerance,
        'Regularization of Covariance': regularize_covariance,
        'Number of Iteration': max_iteration,
        'Method to Initialize': initial_params
    }
    rb.addMD(strip_margin("""
    |## Gaussian Mixture Train Result
    |
    |### Parameters
    |
    | {params}
    |
    |### Summary
    |
    |{result_table}
    |
    """.format(params=dict2MD(params), result_table=pandasDF2MD(out_table))))

    model = _model_dict('gaussian_mixture_train')
    model['input_cols'] = input_cols
    model['number_of_components'] = number_of_components
    model['covariance_type'] = covariance_type
    model['tolerance'] = tolerance
    model['regularize_covariance'] = regularize_covariance
    model['max_iteration'] = max_iteration
    model['initial_params'] = initial_params
    model['seed'] = seed
    model['summary'] = out_table
    model['gmm'] = gmm
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
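# --- Usage sketch (illustrative, not part of the original module) --------------
# Fitting a two-component mixture; the data below is hypothetical.
#
#   df = pd.DataFrame({'x': [0.1, 0.2, 5.0, 5.1], 'y': [0.0, 0.3, 4.9, 5.2]})
#   res = _gaussian_mixture_train(df, input_cols=['x', 'y'],
#                                 number_of_components=2, seed=42)
#   res['model']['summary']   # weight, mean, and covariance per component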
def _mean_shift(table, input_cols, prediction_col='prediction', bandwidth=None, bin_seeding=False,
                min_bin_freq=1, cluster_all=True):
    inputarr = table[input_cols]
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=bin_seeding, min_bin_freq=min_bin_freq,
                   cluster_all=cluster_all, n_jobs=1)
    ms.fit(inputarr)

    label_name = {
        'bandwidth': 'Bandwidth',
        'bin_seeding': 'Bin Seeding',
        'min_bin_freq': 'Minimum Bin Frequency',
        'cluster_all': 'Cluster All'
    }
    get_param = ms.get_params()
    # pd.DataFrame.from_items was removed in pandas 1.0; a plain dict preserves
    # insertion order and builds the same two-column table
    param_table = pd.DataFrame({
        'Parameter': list(label_name.values()),
        'Value': [get_param[x] for x in list(label_name.keys())]
    })

    cluster_centers = ms.cluster_centers_
    n_clusters = len(cluster_centers)
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    labels = ms.labels_

    if len(input_cols) > 1:
        pca2_model = PCA(n_components=2).fit(inputarr)
        pca2 = pca2_model.transform(inputarr)

    fig_centers = _mean_shift_centers_plot(input_cols, cluster_centers, colors)
    if len(table.index) > 100:
        fig_samples = _mean_shift_samples_plot(table, input_cols, 100, cluster_centers, colors)
    else:
        fig_samples = _mean_shift_samples_plot(table, input_cols, None, cluster_centers, colors)

    if len(input_cols) > 1:
        fig_pca = _mean_shift_pca_plot(labels, cluster_centers, pca2_model, pca2, colors)
        rb = BrtcReprBuilder()
        rb.addMD(strip_margin("""
        | ## Mean Shift Result
        | - Coordinates of cluster centers
        | {fig_cluster_centers}
        | - Samples
        | {fig_pca}
        | {fig_samples}
        | ### Parameters
        | {params}
        """.format(fig_cluster_centers=fig_centers, fig_pca=fig_pca, fig_samples=fig_samples,
                   params=pandasDF2MD(param_table))))
    else:
        rb = BrtcReprBuilder()
        rb.addMD(strip_margin("""
        | ## Mean Shift Result
        | - Coordinates of cluster centers
        | {fig_cluster_centers}
        | - Samples
        | {fig_samples}
        | ### Parameters
        | {params}
        """.format(fig_cluster_centers=fig_centers, fig_samples=fig_samples,
                   params=pandasDF2MD(param_table))))

    model = _model_dict('mean_shift')
    model['model'] = ms
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table': out_table, 'model': model}
def _correlation(table, vars, method='pearson', height=2.5, corr_prec=2):
    validate(greater_than(height, 0, 'height'),
             greater_than_or_equal_to(corr_prec, 1, 'corr_prec'))
    size = len(vars)
    s_default = plt.rcParams['lines.markersize'] ** 2.
    scatter_kws = {"s": s_default * height / 6.4}

    result_arr = []
    for i in range(size):
        for j in range(i):
            if method == 'pearson':
                r, p = stats.pearsonr(table[vars[i]], table[vars[j]])
            elif method == 'spearman':
                r, p = stats.spearmanr(table[vars[i]], table[vars[j]])
            elif method == 'kendal':
                r, p = stats.kendalltau(table[vars[i]], table[vars[j]])
            result_arr.append([vars[i], vars[j], r, p])
    df_result = pd.DataFrame(result_arr, columns=['x', 'y', 'corr', 'p_value'])

    def corr(x, y, **kwargs):
        if kwargs['method'] == 'pearson':
            r, p = stats.pearsonr(x, y)
        elif kwargs['method'] == 'spearman':
            r, p = stats.spearmanr(x, y)
        elif kwargs['method'] == 'kendal':
            r, p = stats.kendalltau(x, y)
        p_stars = ''
        if p <= 0.05:
            p_stars = '*'
        if p <= 0.01:
            p_stars = '**'
        if p <= 0.001:
            p_stars = '***'
        corr_text = '{:.{prec}f}'.format(r, prec=corr_prec)
        font_size = abs(r) * 15 * 2 / corr_prec + 5
        ax = plt.gca()
        ax.annotate(corr_text, [.5, .5], xycoords="axes fraction",
                    ha='center', va='center', fontsize=font_size * height)
        ax.annotate(p_stars, xy=(0.65, 0.6), xycoords=ax.transAxes,
                    color='red', fontsize=17 * height)

    g = sns.PairGrid(table, vars=vars, height=height)
    g.map_diag(sns.distplot)
    if method == 'pearson':
        g.map_lower(sns.regplot, scatter_kws=scatter_kws)
    else:
        g.map_lower(sns.regplot, lowess=True, scatter_kws=scatter_kws)
    g.map_upper(corr, method=method)
    fig_corr = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    ## Correlation Results
    | ### Correlation Matrix
    | {fig_corr}
    |
    | ### Correlation Table
    | {table}
    """.format(fig_corr=fig_corr, table=pandasDF2MD(df_result))))

    params = {'vars': vars, 'method': method, 'height': height}

    res = dict()
    res['params'] = params
    res['corr_table'] = df_result
    res['_repr_brtc_'] = rb.get()
    return {'result': res}
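# --- Usage sketch (illustrative, not part of the original module) --------------
# Pairwise correlations over three numeric columns; the frame is hypothetical.
# Note that 'kendal' (one 'l') is the option string this function expects.
#
#   df = pd.DataFrame(np.random.RandomState(0).normal(size=(50, 3)),
#                     columns=['a', 'b', 'c'])
#   res = _correlation(df, vars=['a', 'b', 'c'], method='spearman')
#   res['result']['corr_table']   # x, y, corr, p_value for each pair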
def _chi_square_test_of_independence(table, feature_cols, label_col, correction=False):
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Chi-square Test of Independence Result
    | - H0: the two categorical variables are independent.
    | - H1: the two categorical variables are dependent.
    """))

    model = _model_dict('chi_square_test_of_independence')
    for idx, feature_col in enumerate(feature_cols):
        contingency_table = pd.crosstab(table[feature_col], table[label_col], margins=True)
        feature_index = len(contingency_table) - 1
        label_index = len(contingency_table.columns) - 1
        temporary = contingency_table.iloc[0:feature_index, 0:label_index]
        test = stats.chi2_contingency(np.array(temporary), correction, 1)
        stat_chi = test[0]
        dof = test[2]
        p_chi = test[1]

        if p_chi < 0.05:
            dependence = 'Reject the null hypothesis that two categorical variables are independent at 5% significance level.'
        elif p_chi >= 0.05:
            dependence = 'No association was found between two categorical variables at 5% significance level.'
        elif math.isnan(p_chi):
            dependence = 'Independence of two categorical variables cannot be decided.'

        data = {'estimate': stat_chi, 'df': dof, 'p_value': p_chi}
        result_table = pd.DataFrame([data], columns=['estimate', 'df', 'p_value'])
        model['result{}'.format(idx)] = result_table

        rb.addMD(strip_margin("""
        |### Label: {label}, Feature: {feature}
        |###### Result Table {idx}
        |
        |{result_table}
        |
        |{dependence}
        |
        |
        """.format(label=label_col, feature=feature_col, idx=idx,
                   result_table=pandasDF2MD(result_table), dependence=dependence)))

    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _one_sample_ttest_repr(statistics, result_dict, params):
    input_cols = params['input_cols']
    alternatives = params['alternatives']
    hypothesized_mean = params['hypothesized_mean']
    conf_level = params['conf_level']

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## One Sample T Test Result
    | - Statistics = {s}
    | - Hypothesized mean = {h}
    | - Confidence level = {cl}
    """.format(s=statistics, h=hypothesized_mean, cl=conf_level)))

    for input_col in input_cols:
        H1_list = []
        p_list = []
        CI_list = []
        for alter in alternatives:
            test_info = result_dict[input_col][alter]
            H1_list.append(test_info['alternative_hypothesis'])
            p_list.append(test_info['p_value'])
            CI_list.append(test_info['confidence_interval'])
        # pd.DataFrame.from_items was removed in pandas 1.0; a plain dict keeps
        # the same column order
        result_table = pd.DataFrame({
            'alternative hypothesis': H1_list,
            'p-value': p_list,
            '%g%% confidence Interval' % (conf_level * 100): CI_list
        })
        rb.addMD(strip_margin("""
        | ### Data = {input_col}
        | - t-value = {t_value}
        |
        | {result_table}
        """.format(input_col=input_col, t_value=result_dict[input_col]['t_value'],
                   result_table=pandasDF2MD(result_table))))

    rb.addMD(strip_margin("""
    | ### Parameters
    | {params}
    """.format(params=dict2MD(params))))
    return rb
def _two_sample_ttest_for_stacked_data(table, response_cols, factor_col, alternatives, first=None,
                                       second=None, hypo_diff=0, equal_vari='pooled', confi_level=0.95):
    # coerce the user-supplied factor levels to the type of the factor column
    if first is not None or second is not None:
        check_table = np.array(table[factor_col])
        for element in check_table:
            if element is not None:
                if type(element) != str:
                    if type(element) == bool:
                        if first is not None and second is not None:
                            first = bool(first)
                            second = bool(second)
                            break
                        if first is not None:
                            first = bool(first)
                            break
                        second = bool(second)
                        break
                    else:
                        if first is not None and second is not None:
                            first = float(first)
                            second = float(second)
                            break
                        if first is not None:
                            first = float(first)
                            break
                        second = float(second)
                        break
                else:
                    break
    if first is None or second is None:
        tmp_factors = np.unique(table[factor_col])
        if len(tmp_factors) != 2:
            raise_error('0719', 'factor_col')
        if first is None:
            if tmp_factors[0] != second:
                first = tmp_factors[0]
            else:
                first = tmp_factors[1]
        if second is None:
            if tmp_factors[0] != first:
                second = tmp_factors[0]
            else:
                second = tmp_factors[1]
    table_first = table[table[factor_col] == first]
    table_second = table[table[factor_col] == second]
    tmp_table = []

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    ## Two Sample T Test for Stacked Data Result
    | - Hypothesized mean = {hypo_diff}
    | - Confidence level = {confi_level}
    """.format(hypo_diff=hypo_diff, confi_level=confi_level)))

    for response_col in response_cols:
        tmp_model = []
        number1 = len(table_first[response_col])
        number2 = len(table_second[response_col])
        mean1 = (table_first[response_col]).mean()
        mean2 = (table_second[response_col]).mean()
        std1 = (table_first[response_col]).std()
        std2 = (table_second[response_col]).std()
        start_auto = 0
        if equal_vari == 'auto':
            start_auto = 1
            f_value = (std1 ** 2) / (std2 ** 2)
            f_test_p_value_tmp = stats.f.cdf(1 / f_value, number1 - 1, number2 - 1)
            if f_test_p_value_tmp > 0.5:
                f_test_p_value = (1 - f_test_p_value_tmp) * 2
            else:
                f_test_p_value = f_test_p_value_tmp * 2
            if f_test_p_value < 0.05:
                equal_vari = 'unequal'
            else:
                equal_vari = 'pooled'
        ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'larger',
                                usevar=equal_vari, value=hypo_diff)

        if 'larger' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'larger',
                                    usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            if equal_vari == 'pooled':
                std_number1number2 = sqrt(((number1 - 1) * std1 ** 2 + (number2 - 1) * std2 ** 2)
                                          / (number1 + number2 - 2))
                margin = t.ppf(confi_level, df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if equal_vari == 'unequal':
                margin = t.ppf(confi_level, df) * sqrt(std1 ** 2 / number1 + std2 ** 2 / number2)
            tmp_model += [['true difference in means > {}'.format(hypo_diff)]
                          + [ttestresult[1]]
                          + [(mean1 - mean2 - margin, math.inf)]]
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)]
                          + ['true difference in means > {}'.format(hypo_diff)]
                          + ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]]
                          + [ttestresult[0]] + [ttestresult[1]] + [confi_level]
                          + [mean1 - mean2 - margin] + [math.inf]]

        if 'smaller' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'smaller',
                                    usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            if equal_vari == 'pooled':
                std_number1number2 = sqrt(((number1 - 1) * std1 ** 2 + (number2 - 1) * std2 ** 2)
                                          / (number1 + number2 - 2))
                margin = t.ppf(confi_level, df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if equal_vari == 'unequal':
                margin = t.ppf(confi_level, df) * sqrt(std1 ** 2 / number1 + std2 ** 2 / number2)
            tmp_model += [['true difference in means < {}'.format(hypo_diff)]
                          + [ttestresult[1]]
                          + [(-math.inf, mean1 - mean2 + margin)]]
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)]
                          + ['true difference in means < {}'.format(hypo_diff)]
                          + ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]]
                          + [ttestresult[0]] + [ttestresult[1]] + [confi_level]
                          + [-math.inf] + [mean1 - mean2 + margin]]

        if 'two-sided' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'two-sided',
                                    usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            if equal_vari == 'pooled':
                std_number1number2 = sqrt(((number1 - 1) * std1 ** 2 + (number2 - 1) * std2 ** 2)
                                          / (number1 + number2 - 2))
                margin = t.ppf((confi_level + 1) / 2, df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if equal_vari == 'unequal':
                margin = t.ppf((confi_level + 1) / 2, df) * sqrt(std1 ** 2 / number1 + std2 ** 2 / number2)
            tmp_model += [['true difference in means != {}'.format(hypo_diff)]
                          + [ttestresult[1]]
                          + [(mean1 - mean2 - margin, mean1 - mean2 + margin)]]
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)]
                          + ['true difference in means != {}'.format(hypo_diff)]
                          + ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]]
                          + [ttestresult[0]] + [ttestresult[1]] + [confi_level]
                          + [mean1 - mean2 - margin] + [mean1 - mean2 + margin]]

        result_model = pd.DataFrame.from_records(tmp_model)
        result_model.columns = ['alternative hypothesis', 'p-value',
                                '%g%% confidence interval' % (confi_level * 100)]
        rb.addMD(strip_margin("""
        | #### Data = {response_col} by {factor_col}({first},{second})
        | - Statistics = t statistic, t distribution with {ttestresult2} degrees of freedom under the null hypothesis
        | - t-value = {ttestresult0}
        |
        | {result_model}
        |
        """.format(ttestresult2=ttestresult[2], response_col=response_col, factor_col=factor_col,
                   first=first, second=second, ttestresult0=ttestresult[0],
                   result_model=pandasDF2MD(result_model))))
        if start_auto == 1:
            equal_vari = 'auto'

    result = pd.DataFrame.from_records(tmp_table)
    result.columns = ['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value',
                      'confidence_level', 'lower_confidence_interval', 'upper_confidence_interval']

    model = dict()
    model['_repr_brtc_'] = rb.get()
    return {'out_table': result, 'model': model}
def _normality_test(table, input_cols, method=['kstest', 'jarque_bera', 'anderson']):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Normality test Result""")

    test_name = {
        'kstest': "Kolmogorov-Smirnov test",
        'jarque_bera': "Jarque-Bera test",
        'anderson': "Anderson-Darling test"
    }
    stats_name = {
        'kstest': "KS statistic, asymptotically Kolmogorov distribution under the null hypothesis.",
        'jarque_bera': "JB statistic, asymptotically chi-square distribution with 2 degrees of freedom under the null hypothesis.",
        'anderson': "A^2 statistic. The p-value is computed from the adjusted statistic."
    }

    if 'kstest' in method:
        stats_res = dict()
        stats_res['data'] = []
        stats_res['estimates'] = []
        stats_res['p_value'] = []
        result['kstest'] = dict()
        for input_col in input_cols:
            stats, pval = kstest(table[input_col], 'norm', mode='asymp')
            stats_res['data'].append(input_col)
            stats_res['estimates'].append(stats)
            stats_res['p_value'].append(pval)
            result['kstest'][input_col] = {'estimates': stats, 'p_value': pval}
        rb.addMD(strip_margin("""
        | ## {method} result
        |{stats_table}
        """.format(method=test_name['kstest'], stats_table=pandasDF2MD(pd.DataFrame(stats_res)))))

    if 'jarque_bera' in method:
        stats_res = dict()
        stats_res['data'] = []
        stats_res['estimates'] = []
        stats_res['p_value'] = []
        result['jarque_bera'] = dict()
        for input_col in input_cols:
            stats, pval = jarque_bera(table[input_col])
            stats_res['data'].append(input_col)
            stats_res['estimates'].append(stats)
            stats_res['p_value'].append(pval)
            result['jarque_bera'][input_col] = {'estimates': stats, 'p_value': pval}
        rb.addMD(strip_margin("""
        | ## {method} result
        |{stats_table}
        """.format(method=test_name['jarque_bera'], stats_table=pandasDF2MD(pd.DataFrame(stats_res)))))

    if 'anderson' in method:
        stats_res = dict()
        stats_res['data'] = []
        stats_res['estimates'] = []
        stats_res['critical value'] = []
        stats_res['significance level'] = []
        result['anderson'] = dict()
        for input_col in input_cols:
            stats, critical_val, significance_lvl = anderson(table[input_col], dist='norm')
            stats_res['data'] += [input_col] * len(critical_val)
            stats_res['estimates'] += [stats] * len(critical_val)
            stats_res['critical value'] += list(critical_val)
            stats_res['significance level'] += list(significance_lvl)
            result['anderson'][input_col] = {
                'estimates': [stats] * len(critical_val),
                'critical value': list(critical_val),
                'significance level': list(significance_lvl)
            }
        rb.addMD(strip_margin("""
        | ## {method} result
        |{stats_table}
        """.format(method=test_name['anderson'], stats_table=pandasDF2MD(pd.DataFrame(stats_res)))))

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _duncan_test(table, response_cols, factor_col, alpha=0.05):
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Duncan test Result""")

    for response_col in response_cols:
        mean_by_factor = table.groupby(factor_col).mean()[response_col].sort_values(ascending=False)
        count_by_factor = table.groupby(factor_col).count()[response_col]
        columns = list(table.columns)
        sse = np.sum([np.square(row[columns.index(response_col)]
                                - mean_by_factor[row[columns.index(factor_col)]])
                      for row in table.values])
        df = table.shape[0] - count_by_factor.shape[0]
        mse = sse / df
        n = harmonic_mean(count_by_factor)
        sigma_d = np.sqrt(mse / n)
        classes = table[factor_col].unique()
        classes_cnt = len(classes)

        critical_val = dict()
        critical_val['p'] = range(2, classes_cnt + 1)
        critical_val['critical_value'] = []
        p = 1 - alpha
        for i in range(1, classes_cnt):
            if p < 0.1 or p > 0.999:
                critical_val['critical_value'].append('Not statistically meaningful')
            else:
                critical_val['critical_value'].append(sigma_d * qsturng(p, i + 1, df))
            p = p * (1 - alpha)

        comp_by_factor = dict()
        comp_by_factor['compared_factors'] = []
        comp_by_factor['difference'] = []
        comp_by_factor['critical_value'] = []
        comp_by_factor['significant'] = []
        titles = mean_by_factor.index
        for i in range(classes_cnt):
            for j in range(i + 1, classes_cnt):
                title = str(titles[i]) + ' - ' + str(titles[j])
                comp_by_factor['compared_factors'].append(title)
                difference = abs(mean_by_factor[titles[i]] - mean_by_factor[titles[j]])
                comp_by_factor['difference'].append(difference)
                critical_value = critical_val['critical_value'][critical_val['p'].index(j - i + 1)]
                comp_by_factor['critical_value'].append(critical_value)
                if isinstance(critical_value, (float, int)):
                    if difference > critical_value:
                        comp_by_factor['significant'].append('YES')
                    else:
                        comp_by_factor['significant'].append('NO')
                else:
                    comp_by_factor['significant'].append(critical_value)

        critical_val = pd.DataFrame(critical_val)
        mean_by_factor = pd.DataFrame(mean_by_factor).reset_index()
        comp_by_factor = pd.DataFrame(comp_by_factor)

        rb.addMD(strip_margin("""
        | ## {response_col} by {factor_col}
        |
        | ### Critical value
        | {critical_val}
        |
        | ### Mean value by factor
        | {mean_by_factor}
        |
        | ### Difference by factor
        | {comp_by_factor}
        """.format(response_col=response_col, factor_col=factor_col,
                   critical_val=pandasDF2MD(critical_val, num_rows=critical_val.shape[0]),
                   mean_by_factor=pandasDF2MD(mean_by_factor, num_rows=mean_by_factor.shape[0]),
                   comp_by_factor=pandasDF2MD(comp_by_factor, num_rows=comp_by_factor.shape[0]))))

        group = response_col + '_' + factor_col
        result[group] = dict()
        result[group]['critical_val'] = critical_val
        result[group]['mean_by_factor'] = mean_by_factor
        result[group]['comp_by_factor'] = comp_by_factor

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _pca(table, input_cols, new_column_name='projected_', n_components=None, copy=True, whiten=False,
         svd_solver='auto', tol=0.0, iterated_power='auto', seed=None, hue=None, alpha=0, key_col=None):
    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    # fit with all components; the first n_components are selected below
    pca = PCA(n_components=None, copy=copy, whiten=whiten, svd_solver=svd_solver, tol=tol,
              iterated_power=iterated_power, random_state=seed)
    pca_model = pca.fit(table[input_cols])

    column_names = [new_column_name + str(i) for i in range(n_components)]
    # print(column_names)
    pca_result = pca_model.transform(table[input_cols])
    # columns must be a flat list; wrapping it in another list created a MultiIndex
    out_df = pd.DataFrame(data=pca_result[:, :n_components], columns=column_names)
    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components[:n_components], columns=input_cols)
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_
    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if n_components == 1:
        sns.scatterplot(column_names[0], column_names[0], hue=hue, data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(0, 1, pc_columns=column_names, columns=input_cols,
                          singular_values=res_singular_values, components=res_components,
                          explained_variance_ratio=res_explained_variance_ratio, alpha=alpha,
                          hue=hue, data=out_df, ax=plt.gca(), key_col=key_col)

    plt.figure()
    fig_scree = _screeplot(res_explained_variance, res_explained_variance_ratio, n_components)

    table_explained_variance = pd.DataFrame(res_explained_variance, columns=['explained_variance'])
    table_explained_variance['explained_variance_ratio'] = res_explained_variance_ratio
    table_explained_variance['cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## PCA Result
    | ### Plot
    | {image1}
    |
    | ### Explained Variance
    | {fig_scree}
    | {table_explained_variance}
    |
    | ### Components
    | {table2}
    |
    | ### Parameters
    | {parameter1}
    """.format(image1=plt_two, fig_scree=fig_scree,
               table_explained_variance=pandasDF2MD(table_explained_variance),
               parameter1=dict2MD(res_get_param), table2=pandasDF2MD(res_components_df))))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['_repr_brtc_'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols
    return {'out_table': out_df, 'model': model}
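# --- Usage sketch (illustrative, not part of the original module) --------------
# Projecting onto the first two components; the frame is hypothetical.
#
#   df = pd.DataFrame(np.random.RandomState(1).normal(size=(30, 4)),
#                     columns=['a', 'b', 'c', 'd'])
#   res = _pca(df, input_cols=['a', 'b', 'c', 'd'], n_components=2)
#   res['out_table']   # original columns plus 'projected_0' and 'projected_1'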
def _cross_table(table, input_cols_1, input_cols_2, result='N', margins=False):
    df1 = [table[col] for col in input_cols_1]
    df2 = [table[col] for col in input_cols_2]

    # cross table
    if result == 'N':
        result_table = pd.crosstab(df1, df2, margins=margins)
    elif result == 'N / Row Total':
        result_table = pd.crosstab(df1, df2, margins=margins, normalize='index')
    elif result == 'N / Column Total':
        result_table = pd.crosstab(df1, df2, margins=margins, normalize='columns')
    elif result == 'N / Total':
        result_table = pd.crosstab(df1, df2, margins=margins, normalize='all')
    else:
        raise_runtime_error("Please check 'result'.")

    # each row and column name
    row_names = list(result_table.index)[:]
    if len(input_cols_1) == 1:
        joined_row_name = [str(i) for i in row_names]
    else:
        if not margins:
            joined_row_name = ['_'.join(str(s) for s in row_names[i])
                               for i in range(len(row_names))]
        else:
            joined_row_name = ['_'.join(str(s) for s in row_names[i])
                               for i in range(len(row_names) - 1)] + [row_names[-1][0]]

    column_names = list(result_table.columns)[:]
    if len(input_cols_2) == 1:
        joined_column_name = [str(i) for i in column_names]
    else:
        if not margins:
            joined_column_name = ['_'.join(str(s) for s in column_names[i])
                                  for i in range(len(column_names))]
        else:
            joined_column_name = ['_'.join(str(s) for s in column_names[i])
                                  for i in range(len(column_names) - 1)] + [column_names[-1][0]]

    # cross table
    if result == 'N':
        result_table.insert(loc=0, column=' ', value=joined_row_name)
        result_table.columns = np.append('N', joined_column_name)
    # cross table normalize by row
    elif result == 'N / Row Total':
        result_table.insert(loc=0, column=' ', value=joined_row_name)
        result_table.columns = np.append('N / Row Total', joined_column_name)
    # cross table normalize by column
    elif result == 'N / Column Total':
        result_table.insert(loc=0, column=' ', value=joined_row_name)
        result_table.columns = np.append('N / Column Total', joined_column_name)
    # cross table normalize by all values
    elif result == 'N / Total':
        result_table.insert(loc=0, column=' ', value=joined_row_name)
        result_table.columns = np.append('N / Total', joined_column_name)
    else:
        raise_runtime_error("Please check 'result'.")

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Cross Table Result
    | ### Result Type : {result}
    |
    | #### Result Table
    |
    | {result_table}
    |
    """.format(result=result,
               result_table=pandasDF2MD(result_table, num_rows=len(result_table.index) + 1))))

    model = _model_dict('cross_table')
    model['result'] = result
    model['result_table'] = result_table
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
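# --- Usage sketch (illustrative, not part of the original module) --------------
# A 2x2 count table with margins; the frame and column names are hypothetical.
#
#   df = pd.DataFrame({'gender': ['m', 'f', 'f', 'm'],
#                      'grade':  ['A', 'B', 'A', 'A']})
#   res = _cross_table(df, input_cols_1=['gender'], input_cols_2=['grade'],
#                      result='N', margins=True)
#   res['model']['result_table']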
def _mlp_regression_train(table, feature_cols, label_col, hidden_layer_sizes=(100,), activation='relu',
                          solver='adam', alpha=0.0001, batch_size_auto=True, batch_size='auto',
                          learning_rate='constant', learning_rate_init=0.001, max_iter=200,
                          random_state=None, tol=0.0001):
    features = table[feature_cols]
    label = table[label_col]
    mlp_model = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver,
                             alpha=alpha, batch_size=batch_size, learning_rate=learning_rate,
                             learning_rate_init=learning_rate_init, max_iter=max_iter, shuffle=True,
                             random_state=random_state, tol=tol)
    mlp_model.fit(features, label)
    predict = mlp_model.predict(features)

    intercepts = mlp_model.intercepts_
    coefficients = mlp_model.coefs_
    loss = mlp_model.loss_

    _mean_absolute_error = mean_absolute_error(label, predict)
    _mean_squared_error = mean_squared_error(label, predict)
    _r2_score = r2_score(label, predict)

    # pd.DataFrame.from_items was removed in pandas 1.0; a plain dict preserves order
    result_table = pd.DataFrame({
        'Metric': ['Mean Absolute Error', 'Mean Squared Error', 'R2 Score'],
        'Score': [_mean_absolute_error, _mean_squared_error, _r2_score]
    })

    label_name = {
        'hidden_layer_sizes': 'Hidden Layer Sizes',
        'activation': 'Activation Function',
        'solver': 'Solver',
        'alpha': 'Alpha',
        'batch_size': 'Batch Size',
        'learning_rate': 'Learning Rate',
        'learning_rate_init': 'Learning Rate Initial',
        'max_iter': 'Max Iteration',
        'random_state': 'Seed',
        'tol': 'Tolerance'
    }
    get_param = mlp_model.get_params()
    param_table = pd.DataFrame({
        'Parameter': list(label_name.values()),
        'Value': [get_param[x] for x in list(label_name.keys())]
    })

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ### MLP Regression Result
    | {result}
    | ### Parameters
    | {list_parameters}
    """.format(result=pandasDF2MD(result_table), list_parameters=pandasDF2MD(param_table))))

    model = _model_dict('mlp_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercepts'] = mlp_model.intercepts_
    model['coefficients'] = mlp_model.coefs_
    model['loss'] = mlp_model.loss_
    model['mean_absolute_error'] = _mean_absolute_error
    model['mean_squared_error'] = _mean_squared_error
    model['r2_score'] = _r2_score
    model['activation'] = activation
    model['solver'] = solver
    model['alpha'] = alpha
    model['batch_size'] = batch_size
    model['learning_rate'] = learning_rate
    model['learning_rate_init'] = learning_rate_init
    model['max_iter'] = max_iter
    model['random_state'] = random_state
    model['tol'] = tol
    model['mlp_model'] = mlp_model
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _hierarchical_clustering_post(model, num_clusters, cluster_col='cluster'):
    Z = model['model']
    mode = model['input_mode']
    out_table = model['linkage_matrix']
    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    if mode == 'original':
        prediction_table = model['table']
    elif mode == 'matrix':
        # the train step labels this column 'label'
        prediction_table = model['dist_matrix'][['label']]
    if num_clusters == 1:
        prediction_table[cluster_col] = [1 for _ in range(len(prediction_table.index))]
    else:
        prediction_table[cluster_col] = predict

    L, M = leaders(Z, predict)
    which_cluster = []
    # column names must match those built by _hierarchical_clustering
    for leader in L:
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])

    clusters_info_table = pd.DataFrame([])
    if num_clusters == 1:
        clusters_info_table[cluster_col] = [1]
        clusters_info_table['name of clusters'] = [out_table['name_of_clusters'][len(Z) - 1]]
        clusters_info_table['number of entities'] = [out_table['number_of_original'][len(Z) - 1]]
    else:
        clusters_info_table[cluster_col] = M
        clusters_info_table['name of clusters'] = which_cluster
        clusters_info_table = clusters_info_table.sort_values(cluster_col)
        cluster_count = np.bincount(prediction_table[cluster_col])
        cluster_count = cluster_count[cluster_count != 0]
        clusters_info_table['number of entities'] = list(cluster_count)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Hierarchical Clustering Post Process Result"""))
    rb.addMD(strip_margin("""
    |### Parameters
    |
    |{display_params}
    |
    |### Clusters Information
    |
    |{clusters_info_table}
    |
    """.format(display_params=dict2MD(model['parameters']),
               clusters_info_table=pandasDF2MD(clusters_info_table,
                                               num_rows=len(clusters_info_table.index) + 1))))

    model = _model_dict('hierarchical_clustering_post_process')
    model['clusters_info'] = clusters_info_table
    model['_repr_brtc_'] = rb.get()
    return {'out_table': prediction_table, 'model': model}
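# --- Usage sketch (illustrative, not part of the original module) --------------
# The post-process step consumes the model produced by _hierarchical_clustering
# (which stores the training table for 'original' mode); the frame below is
# hypothetical.
#
#   df = pd.DataFrame({'a': [1.0, 1.1, 5.0, 5.2], 'b': [0.9, 1.2, 4.8, 5.1]})
#   trained = _hierarchical_clustering(df, input_cols=['a', 'b'],
#                                      input_mode='original')
#   post = _hierarchical_clustering_post(trained['model'], num_clusters=2)
#   post['out_table']   # input rows with an added 'cluster' column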
def _oneway_anova(table, response_cols, factor_col):
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    ## One-way Analysis of Variance Result
    """))

    groups = table[factor_col].unique()
    groups.sort()
    sum_len = np.sum([len(str(group)) for group in groups])

    result = dict()
    result['_grouped_data'] = dict()

    for response_col in response_cols:
        data = table[response_col]
        result['_grouped_data'][response_col] = dict()

        ax = sns.boxplot(x=factor_col, y=response_col, data=table, order=groups)
        if sum_len > 512:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        elif sum_len > 64:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
        fig_box = plt2MD(plt)
        plt.clf()

        model = ols("""Q('{response_col}') ~ C(Q('{factor_col}'))""".format(
            response_col=response_col, factor_col=factor_col),
            table).fit()  # TODO factor_col = class => error
        anova = anova_lm(model)

        index_list = anova.index.tolist()
        remove_list = ["C(Q('", "'))", "Q('", "')"]
        for v in remove_list:
            index_list = [i.replace(v, "") for i in index_list]
        anova.insert(0, '', index_list)
        anova_df = pandasDF2MD(anova)

        p_value = anova["""PR(>F)"""][0]

        residual = model.resid

        sns.distplot(residual)
        distplot = plt2MD(plt)
        plt.clf()

        sm.qqplot(residual, line='s')
        qqplot = plt2MD(plt)
        plt.clf()

        rb.addMD(strip_margin("""
        | ## {response_col} by {factor_col}
        | {fig_box}
        |
        | ### ANOVA
        | {anova_df}
        |
        | ### Diagnostics
        | {distplot}
        |
        | {qqplot}
        """.format(response_col=response_col, factor_col=factor_col, fig_box=fig_box,
                   anova_df=anova_df, distplot=distplot, qqplot=qqplot)))
        result['_grouped_data'][response_col]['p_value'] = p_value

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _penalized_linear_regression_train(table, feature_cols, label_col, regression_type='ridge', alpha=1.0,
                                       l1_ratio=0.5, fit_intercept=True, max_iter=1000, tol=0.0001,
                                       random_state=None):
    out_table = table.copy()
    features = out_table[feature_cols]
    label = out_table[label_col]
    if regression_type == 'ridge':
        regression_model = Ridge(alpha=alpha, fit_intercept=fit_intercept, max_iter=None,
                                 tol=tol, solver='auto', random_state=random_state)
    elif regression_type == 'lasso':
        regression_model = Lasso(alpha=alpha, fit_intercept=fit_intercept, max_iter=max_iter,
                                 tol=tol, random_state=random_state, selection='random')
    elif regression_type == 'elastic_net':
        regression_model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept,
                                      max_iter=max_iter, tol=tol, random_state=random_state,
                                      selection='random')
    else:
        raise_runtime_error("Please check 'regression_type'.")

    # fit once and reuse the fitted model instead of refitting for each attribute
    regression_model.fit(features, label)

    out_table1 = pd.DataFrame([])
    out_table1['x_variable_name'] = [variable for variable in feature_cols]
    out_table1['coefficient'] = regression_model.coef_
    if fit_intercept:
        intercept = pd.DataFrame([['intercept', regression_model.intercept_]],
                                 columns=['x_variable_name', 'coefficient'])
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here
        out_table1 = pd.concat([out_table1, intercept], ignore_index=True)

    predict = regression_model.predict(features)
    residual = label - predict
    out_table['predict'] = predict
    out_table['residual'] = residual

    if regression_type == 'elastic_net':
        params = {
            'Feature Columns': feature_cols,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'L1 Ratio': l1_ratio,
            'Fit Intercept': fit_intercept,
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }
    else:
        params = {
            'Feature Columns': feature_cols,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'Fit Intercept': fit_intercept,
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }

    score = {
        'MSE': mean_squared_error(label, predict),
        'R2': r2_score(label, predict)
    }

    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()

    # checking the magnitude of coefficients
    plt.figure()
    predictors = features.columns
    coef = Series(regression_model.coef_, predictors).sort_values()
    coef.plot(kind='bar', title='Model Coefficients')
    plt.tight_layout()
    fig_model_coefficients = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | # Penalized Linear Regression Result
    | ### Selected Parameters:
    | {params}
    |
    | ## Results
    | ### Model Parameters
    | {out_table1}
    |
    | ### Prediction and Residual
    | {out_table2}
    |
    | ### Regression Score
    | {score}
    |
    """.format(params=dict2MD(params), out_table1=pandasDF2MD(out_table1),
               out_table2=pandasDF2MD(out_table, num_rows=len(out_table) + 1),
               score=dict2MD(score))))
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    |
    | ### Magnitude of Coefficients
    | {image5}
    |
    """.format(image1=fig_actual_predict, image2=fig_residual_1, image3=fig_residual_2,
               image4=fig_residual_3, image5=fig_model_coefficients)))

    model = _model_dict('penalized_linear_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['regression_type'] = regression_type
    model['regression_model'] = regression_model
    model['model_parameters'] = out_table1
    model['prediction_residual'] = out_table
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
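# --- Usage sketch (illustrative, not part of the original module) --------------
# Lasso variant; the frame and column names are hypothetical.
#
#   df = pd.DataFrame({'x1': [1, 2, 3, 4, 5], 'x2': [2, 1, 4, 3, 5],
#                      'y':  [1.1, 1.9, 3.2, 3.8, 5.0]})
#   res = _penalized_linear_regression_train(df, feature_cols=['x1', 'x2'],
#                                            label_col='y', regression_type='lasso',
#                                            alpha=0.1, random_state=0)
#   res['model']['model_parameters']   # coefficient (and intercept) table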
def _logistic_regression_train(table, feature_cols, label_col, penalty='l2',
                               dual=False, tol=0.0001, C=1.0,
                               fit_intercept=True, intercept_scaling=1,
                               class_weight=None, random_state=None,
                               solver='liblinear', max_iter=100,
                               multi_class='ovr', verbose=0, warm_start=False,
                               n_jobs=1):
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]
    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')
    lr_model = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state, solver=solver,
                                  max_iter=max_iter, multi_class=multi_class,
                                  verbose=verbose, warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)
    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2
    prob = lr_model.predict_proba(features)
    classes_dict = dict()
    for i in range(len(classes)):
        classes_dict[classes[i]] = i
    tmp_label = np.array([classes_dict[i] for i in label])
    # Accumulate the log-likelihood in log space; multiplying the per-row
    # probabilities directly underflows to zero for large tables.
    log_likelihood = np.sum(np.log(prob[np.arange(len(table)), tmp_label]))
    if fit_intercept:
        k = len(feature_cols) + 1
    else:
        k = len(feature_cols)
    aic = 2 * k - 2 * log_likelihood
    bic = np.log(len(table)) * k - 2 * log_likelihood
    if is_binary:
        if fit_intercept:
            x_design = np.hstack([np.ones((features.shape[0], 1)), features])
        else:
            x_design = features.values
        v = np.prod(prob, axis=1)
        x_design_modi = np.array(
            [x_design[i] * v[i] for i in range(len(x_design))])
        cov_logit = np.linalg.inv(np.dot(x_design_modi.T, x_design))
        std_err = np.sqrt(np.diag(cov_logit))
        if fit_intercept:
            logit_params = np.insert(coefficients, 0, intercept)
        else:
            logit_params = coefficients
        wald = (logit_params / std_err) ** 2
        p_values = 1 - chi2.cdf(wald, 1)
    else:
        if fit_intercept:
            x_design = np.hstack([np.ones((features.shape[0], 1)), features])
        else:
            x_design = features.values
        std_err = []
        for i in range(len(classes)):
            v = prob.T[i] * (1 - prob.T[i])
            x_design_modi = np.array(
                [x_design[j] * v[j] for j in range(len(x_design))])
            cov_logit = np.linalg.inv(np.dot(x_design_modi.T, x_design))
            std_err.append(np.sqrt(np.diag(cov_logit)))
        std_err = np.array(std_err)
    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_names})
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)),
                                    axis=0)
    else:
        summary = pd.DataFrame({'features': feature_names})
        coef_trans = np.transpose(coefficients)
    if not is_binary:
        summary = pd.concat(
            (summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
    else:
        summary = pd.concat(
            (summary, pd.DataFrame(coef_trans, columns=[classes[0]])), axis=1)
    if is_binary:
        summary = pd.concat(
            (summary, pd.DataFrame(std_err, columns=['standard_error']),
             pd.DataFrame(wald, columns=['wald_statistic']),
             pd.DataFrame(p_values, columns=['p_value'])), axis=1)
    else:
        columns = [
            'standard_error_{}'.format(classes[i])
            for i in range(len(classes))
        ]
        summary = pd.concat(
            (summary, pd.DataFrame(std_err.T, columns=columns)), axis=1)
        arrange_col = ['features']
        for i in range(len(classes)):
            arrange_col.append(classes[i])
            arrange_col.append('standard_error_{}'.format(classes[i]))
        summary = summary[arrange_col]
    if is_binary:
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    |
    | ##### Column '{small}' contains the coefficients under the assumption ({small} = 0, {big} = 1).
    |
    | #### AIC : {aic}
    |
    | #### BIC : {bic}
    """.format(small=classes[0], big=classes[1],
               table1=pandasDF2MD(summary, num_rows=100), aic=aic, bic=bic)))
    else:
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    |
    | ##### Each column named after a class of the label column contains the coefficients under the assumption that the class is 1 and the others are 0.
    |
    | ##### For example, column '{small}' contains the coefficients under the assumption ({small} = 1, others = 0).
    |
    | #### AIC : {aic}
    |
    | #### BIC : {bic}
    """.format(small=classes[0], table1=pandasDF2MD(summary, num_rows=100),
               aic=aic, bic=bic)))
    model = _model_dict('logistic_regression_model')
    model['standard_errors'] = std_err
    model['aic'] = aic
    model['bic'] = bic
    if is_binary:
        model['wald_statistics'] = wald
        model['p_values'] = p_values
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()
    model['summary'] = summary
    return {'model': model}
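# Numerical sketch (hypothetical helper, scikit-learn only): the identities
# used above are AIC = 2k - 2*logL and BIC = k*ln(n) - 2*logL, with k the
# number of fitted coefficients (plus the intercept when fitted). Working in
# log space via predict_log_proba avoids the underflow that multiplying n
# per-row probabilities would cause.
def _sketch_logistic_aic_bic():
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=200, n_features=4, random_state=0)
    clf = LogisticRegression().fit(X, y)

    # log P(y_i | x_i) per row; classes_ is [0, 1], so y indexes columns.
    log_likelihood = clf.predict_log_proba(X)[np.arange(len(y)), y].sum()
    k = X.shape[1] + 1  # four coefficients plus the intercept
    aic = 2 * k - 2 * log_likelihood
    bic = np.log(len(y)) * k - 2 * log_likelihood
    return aic, bic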
def _als_train(table, user_col, item_col, rating_col, mode='train', number=10,
               filter=True, implicit=False, iterations=10, reg_param=0.1,
               rank=10, alpha=1.0, seed=None, targets=None, workers=1):
    table_user_col = table[user_col]
    table_item_col = table[item_col]
    table_rating_col = table[rating_col]
    # Zero ratings would vanish from the sparse matrix, so map them to -1.
    ratings = np.where(table_rating_col == 0, -1, table_rating_col)
    user_encoder = preprocessing.LabelEncoder()
    item_encoder = preprocessing.LabelEncoder()
    user_encoder.fit(table_user_col)
    item_encoder.fit(table_item_col)
    user_correspond = user_encoder.transform(table_user_col)
    item_correspond = item_encoder.transform(table_item_col)
    item_users = csr_matrix((ratings, (item_correspond, user_correspond)))
    als_model = AlternatingLeastSquares(factors=rank, implicit=implicit,
                                        iterations=iterations,
                                        regularization=reg_param,
                                        alpha=alpha, seed=seed)
    als_model.fit(item_users)
    tmp_col = [list(factor) for factor in als_model.user_factors]
    user_factors = pd.DataFrame(user_encoder.classes_, columns=[user_col])
    user_factors['features'] = tmp_col
    tmp_col = [list(factor) for factor in als_model.item_factors]
    item_factors = pd.DataFrame(item_encoder.classes_, columns=[item_col])
    item_factors['features'] = tmp_col
    if mode == 'Topn':
        if targets is None:
            targets = user_encoder.classes_
        # np.int was removed from recent NumPy releases; a subdtype check
        # covers the same numeric user-id case.
        if np.issubdtype(table_user_col.dtype, np.number):
            targets = [float(i) for i in targets]
        targets_en = user_encoder.transform(targets)
        user_items = item_users.T.tocsr()
        Topn_result = []
        if workers == 1:
            for user in targets_en:
                recommendations_corre = als_model.recommend(
                    user, user_items, number,
                    filter_already_liked_items=filter)
                recommendations = []
                for (item, rating) in recommendations_corre:
                    recommendations += [
                        item_encoder.inverse_transform([item])[0], rating]
                Topn_result += [recommendations]
        else:
            Topn_result_tmp = apply_by_multiprocessing_list_to_list(
                targets_en, _recommend_multi, user_items=user_items,
                number=number, item_encoder=item_encoder,
                als_model=als_model, workers=workers, filter=filter)
            Topn_result = []
            for i in range(workers):
                Topn_result += Topn_result_tmp[i]
        Topn_result = pd.DataFrame(Topn_result)
        Topn_result = pd.concat([pd.DataFrame(targets), Topn_result],
                                axis=1, ignore_index=True)
        column_names = ['user']
        for i in range(number):
            column_names += ['item_top%d' % (i + 1), 'rating_top%d' % (i + 1)]
        Topn_result.columns = column_names
        return {'out_table': Topn_result}
    parameters = dict()
    parameters['Iterations'] = iterations
    parameters['Reg Param'] = reg_param
    parameters['Seed'] = seed
    parameters['Rank'] = rank
    if implicit:
        parameters['alpha'] = alpha
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## ALS Train Result
    |
    | ### Parameters
    | {parameters}
    | ### Item Factors
    | {item_factors}
    | ### User Factors
    | {user_factors}
    |
    """.format(item_factors=pandasDF2MD(item_factors, num_rows=100),
               user_factors=pandasDF2MD(user_factors, num_rows=100),
               parameters=dict2MD(parameters))))
    model = _model_dict('ALS')
    model['als_model'] = als_model
    model['item_encoder'] = item_encoder
    model['user_encoder'] = user_encoder
    model['user_col'] = user_col
    model['item_col'] = item_col
    model['user_factors'] = user_factors
    model['item_factors'] = item_factors
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
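# Sketch (hypothetical helper; assumes the factor layout used by the
# 'implicit' library above): ALS scores a user/item pair as the inner
# product of the corresponding factor rows, which is the quantity
# recommend() ranks. Random factors stand in for a trained model.
def _sketch_als_scoring():
    import numpy as np

    rng = np.random.RandomState(0)
    user_factors = rng.rand(5, 10)  # 5 users, rank-10 factors
    item_factors = rng.rand(7, 10)  # 7 items

    # Scores of every item for user 0; descending argsort gives the top-N.
    scores = item_factors @ user_factors[0]
    top3 = np.argsort(scores)[::-1][:3]
    return top3, scores[top3]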
def dataframe_to_md(table, n=20, precision=None, max_width=None):
    # 'precision' and 'max_width' are accepted for interface compatibility
    # but are not forwarded; pandasDF2MD only takes the row count here.
    return pandasDF2MD(table, num_rows=n)
def _hierarchical_clustering_post(model, num_clusters, cluster_col='cluster'): if 'linkage_matrix' not in model: model_table = model['table_1'] length = len(model_table) + 1 tmp_table = model_table[[ 'clusters_joined1', 'clusters_joined2', 'height', 'frequency' ]] tmp = [ i for i in tmp_table[['clusters_joined1', 'clusters_joined2' ]].values.flatten() if i.split("_")[0] != 'CL' ] label_encoder = preprocessing.LabelEncoder().fit(tmp) tmp_table['clusters_joined2'] = tmp_table['clusters_joined2'].apply( _change_name, length=length, encoder=label_encoder) tmp_table['clusters_joined1'] = tmp_table['clusters_joined1'].apply( _change_name, length=length, encoder=label_encoder) Z = tmp_table.values predict = fcluster(Z, t=num_clusters, criterion='maxclust') data_names = ['pt_' + str(i) for i in range(length)] prediction_table = pd.DataFrame() prediction_table['name'] = data_names else: Z = model['model'] mode = model['input_mode'] out_table = model['linkage_matrix'] predict = fcluster(Z, t=num_clusters, criterion='maxclust') if mode == 'original': prediction_table = model['table'] elif mode == 'matrix': prediction_table = model['dist_matrix'][['name']] if num_clusters == 1: prediction_table[cluster_col] = [ 1 for _ in range(len(prediction_table.index)) ] else: prediction_table[cluster_col] = predict L, M = leaders(Z, predict) which_cluster = [] if 'linkage_matrix' not in model: for leader in L: which_cluster.append('CL_' + str(2 * length - 1 - leader)) else: for leader in L: if leader in Z[:, 0]: select_indices = np.where(Z[:, 0] == leader)[0][0] which_cluster.append( out_table['joined column1'][select_indices]) elif leader in Z[:, 1]: select_indices = np.where(Z[:, 1] == leader)[0][0] which_cluster.append( out_table['joined column2'][select_indices]) clusters_info_table = pd.DataFrame([]) if num_clusters == 1 and 'linkage_matrix' in model: clusters_info_table[cluster_col] = [1] clusters_info_table['name of clusters'] = [ out_table['name of clusters'][len(Z) - 1] ] clusters_info_table['number of entities'] = [ out_table['number of original'][len(Z) - 1] ] else: clusters_info_table[cluster_col] = M clusters_info_table['name of clusters'] = which_cluster clusters_info_table = clusters_info_table.sort_values(cluster_col) cluster_count = np.bincount(prediction_table[cluster_col]) cluster_count = cluster_count[cluster_count != 0] clusters_info_table['number of entities'] = list(cluster_count) if 'linkage_matrix' in model: rb = BrtcReprBuilder() rb.addMD( strip_margin("""# Hierarchical Clustering Post Process Result""")) rb.addMD( strip_margin(""" |### Parameters | |{display_params} | |### Clusters Information | |{clusters_info_table} | """.format(display_params=dict2MD(model['parameters']), clusters_info_table=pandasDF2MD( clusters_info_table, num_rows=len(clusters_info_table.index) + 1)))) else: rb = BrtcReprBuilder() rb.addMD( strip_margin("""# Hierarchical Clustering Post Process Result""")) rb.addMD( strip_margin(""" | |### Clusters Information | |{clusters_info_table} | """.format(clusters_info_table=pandasDF2MD( clusters_info_table, num_rows=len(clusters_info_table.index) + 1)))) model = _model_dict('hierarchical_clustering_post_process') model['clusters_info'] = clusters_info_table model['_repr_brtc_'] = rb.get() return {'out_table': prediction_table, 'model': model}
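# Sketch (hypothetical helper, SciPy only): fcluster(Z, t=k,
# criterion='maxclust') cuts a dendrogram into at most k flat clusters, and
# leaders(Z, labels) returns, for each flat cluster, the dendrogram node
# that roots it -- the same pair of calls the post-processing above uses to
# name clusters.
def _sketch_fcluster_leaders():
    import numpy as np
    from scipy.cluster.hierarchy import fcluster, leaders, linkage

    rng = np.random.RandomState(0)
    points = np.vstack([rng.normal(0, 0.1, (5, 2)),
                        rng.normal(3, 0.1, (5, 2))])
    Z = linkage(points, method='complete', metric='euclidean')

    labels = fcluster(Z, t=2, criterion='maxclust')  # 1-based flat labels
    L, M = leaders(Z, labels)  # root node id and cluster id per cluster
    return labels, L, M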
def _ftest_for_stacked_data(table, response_cols, factor_col, alternatives,
                            first=None, second=None, confi_level=0.95):
    if type(table[factor_col][0]) != str:
        if type(table[factor_col][0]) == bool:
            if first is not None:
                first = bool(first)
            if second is not None:
                second = bool(second)
        else:
            if first is not None:
                first = float(first)
            if second is not None:
                second = float(second)
    if first is None or second is None:
        tmp_factors = []
        if first is not None:
            tmp_factors += [first]
        if second is not None:
            tmp_factors += [second]
        for i in range(len(table[factor_col])):
            if table[factor_col][i] is not None \
                    and table[factor_col][i] not in tmp_factors:
                if len(tmp_factors) == 2:
                    raise Exception("There are more than 2 factors.")
                else:
                    tmp_factors += [table[factor_col][i]]
        if first is None:
            if tmp_factors[0] != second:
                first = tmp_factors[0]
            else:
                first = tmp_factors[1]
        if second is None:
            if tmp_factors[0] != first:
                second = tmp_factors[0]
            else:
                second = tmp_factors[1]
    table_first = table[table[factor_col] == first]
    table_second = table[table[factor_col] == second]
    tmp_table = []
    number1 = len(table_first[factor_col])
    number2 = len(table_second[factor_col])
    d_num = number1 - 1
    d_denum = number2 - 1
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    ## F Test for Stacked Data Result
    | - Confidence level = {confi_level}
    | - Statistics = F statistic, F distribution with {d_num} numerator degrees of freedom and {d_denum} denominator degrees of freedom under the null hypothesis
    """.format(confi_level=confi_level, d_num=d_num, d_denum=d_denum)))
    for response_col in response_cols:
        tmp_model = []
        std1 = (table_first[response_col]).std()
        std2 = (table_second[response_col]).std()
        f_value = (std1 ** 2) / (std2 ** 2)
        # Under the null hypothesis f_value follows F(d_num, d_denum), so the
        # upper tail is sf(f_value, d_num, d_denum). The previous
        # cdf(1 / f_value, d_num, d_denum) silently swapped the degrees of
        # freedom whenever the two samples differed in size.
        if 'larger' in alternatives:
            p_value = scipy.stats.f.sf(f_value, d_num, d_denum)
            tmp_model += [
                ['true ratio > 1'] + [p_value] +
                [(f_value / scipy.stats.f.ppf(confi_level, d_num, d_denum),
                  math.inf)]]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true ratio of variances > 1'] + [
                'F statistic, F distribution with %d numerator degrees of freedom and %d denominator degrees of freedom under the null hypothesis.'
                % (d_num, d_denum)
            ] + [f_value] + [p_value] + [confi_level] + [
                f_value / scipy.stats.f.ppf(confi_level, d_num, d_denum)
            ] + [math.inf]]
        if 'smaller' in alternatives:
            p_value = scipy.stats.f.cdf(f_value, d_num, d_denum)
            tmp_model += [
                ['true ratio < 1'] + [p_value] +
                [(0.0,
                  f_value * scipy.stats.f.ppf(confi_level, d_denum, d_num))]]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true ratio of variances < 1'] + [
                'F statistic, F distribution with %d numerator degrees of freedom and %d denominator degrees of freedom under the null hypothesis.'
                % (d_num, d_denum)
            ] + [f_value] + [p_value] + [confi_level] + [0.0] + [
                f_value * scipy.stats.f.ppf(confi_level, d_denum, d_num)
            ]]
        if 'two-sided' in alternatives:
            p_value_tmp = scipy.stats.f.sf(f_value, d_num, d_denum)
            if p_value_tmp > 0.5:
                p_value = (1 - p_value_tmp) * 2
            else:
                p_value = p_value_tmp * 2
            tmp_model += [
                ['true ratio != 1'] + [p_value] +
                [(f_value / scipy.stats.f.ppf(
                    (1 + confi_level) / 2, d_num, d_denum),
                  f_value * scipy.stats.f.ppf(
                      (1 + confi_level) / 2, d_denum, d_num))]]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true ratio of variances != 1'] + [
                'F statistic, F distribution with %d numerator degrees of freedom and %d denominator degrees of freedom under the null hypothesis.'
                % (d_num, d_denum)
            ] + [f_value] + [p_value] + [confi_level] + [
                f_value / scipy.stats.f.ppf(
                    (1 + confi_level) / 2, d_num, d_denum)
            ] + [
                f_value * scipy.stats.f.ppf(
                    (1 + confi_level) / 2, d_denum, d_num)
            ]]
        result_model = pd.DataFrame.from_records(tmp_model)
        result_model.columns = [
            'alternative_hypothesis', 'p-value',
            '%g%% confidence interval' % (confi_level * 100)
        ]
        rb.addMD(
            strip_margin("""
    | #### Data = {response_col} by {factor_col}({first},{second})
    | - F-value = {f_value}
    |
    | {result_model}
    |
    """.format(response_col=response_col, factor_col=factor_col, first=first,
               second=second, f_value=f_value,
               result_model=pandasDF2MD(result_model))))
    result = pd.DataFrame.from_records(tmp_table)
    result.columns = [
        'data', 'alternative_hypothesis', 'statistics', 'estimates',
        'p_value', 'confidence_level', 'lower_confidence_interval',
        'upper_confidence_interval'
    ]
    model = dict()
    model['_repr_brtc_'] = rb.get()
    return {'out_table': result, 'model': model}
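# Sketch (hypothetical helper, SciPy only): the statistic above is the ratio
# of the two sample variances, F = s1^2 / s2^2, which under the null
# hypothesis follows F(n1 - 1, n2 - 1). The upper tail can be written either
# as sf(f, d_num, d_denum) or, equivalently, cdf(1 / f, d_denum, d_num) --
# note the swapped degrees of freedom in the second form.
def _sketch_variance_ratio_test():
    import numpy as np
    import scipy.stats

    rng = np.random.RandomState(0)
    a = rng.normal(0, 2.0, size=30)
    b = rng.normal(0, 1.0, size=25)

    f_value = a.var(ddof=1) / b.var(ddof=1)
    d_num, d_denum = len(a) - 1, len(b) - 1
    p_larger = scipy.stats.f.sf(f_value, d_num, d_denum)
    assert np.isclose(p_larger,
                      scipy.stats.f.cdf(1 / f_value, d_denum, d_num))
    return f_value, p_larger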
def _hierarchical_clustering(table, input_cols, input_mode='original', key_col=None, link='complete', met='euclidean', num_rows=20, figure_height=6.4, orient='right'): out_table = table.copy() feature_names, features = check_col_type(out_table, input_cols) if input_mode == 'original': len_features = len(features) if key_col != None: data_names = list(out_table[key_col]) elif key_col == None: data_names = ['pt_' + str(i) for i in range(len_features)] out_table['name'] = data_names Z = linkage(ssd.pdist(features, metric=met), method=link, metric=met) elif input_mode == 'matrix': len_features = len(input_cols) if key_col != None: data_names = [] for column in input_cols: data_names.append( out_table[key_col][out_table.columns.get_loc(column)]) elif key_col == None: data_names = [] for column in input_cols: data_names.append( out_table.columns[out_table.columns.get_loc(column)]) col_index = [] for column in input_cols: col_index.append(out_table.columns.get_loc(column)) dist_matrix = features.iloc[col_index] Z = linkage(ssd.squareform(dist_matrix), method=link, metric=met) dist_matrix['name'] = data_names else: raise_runtime_error("Please check 'input_mode'.") range_len_Z = range(len(Z)) linkage_matrix = pd.DataFrame([]) linkage_matrix['linkage step'] = [ '%g' % (x + 1) for x in reversed(range_len_Z) ] linkage_matrix['name of clusters'] = [ 'CL_%g' % (i + 1) for i in reversed(range_len_Z) ] joined_column1 = [] for i in range_len_Z: if Z[:, 0][i] < len_features: joined_column1.append(data_names[int(Z[:, 0][i])]) elif Z[:, 0][i] >= len_features: joined_column1.append( linkage_matrix['name of clusters'][Z[:, 0][i] - len_features]) linkage_matrix['joined column1'] = joined_column1 joined_column2 = [] for i in range_len_Z: if Z[:, 1][i] < len_features: joined_column2.append(data_names[int(Z[:, 1][i])]) elif Z[:, 1][i] >= len_features: joined_column2.append( linkage_matrix['name of clusters'][Z[:, 1][i] - len_features]) linkage_matrix['joined column2'] = joined_column2 linkage_matrix['distance'] = [distance for distance in Z[:, 2]] linkage_matrix['number of original'] = [ int(entities) for entities in Z[:, 3] ] linkage_matrix = linkage_matrix.reindex( index=linkage_matrix.index[::-1])[0:] # calculate full dendrogram plt.figure(figsize=(8.4, figure_height)) dendrogram(Z, truncate_mode='none', get_leaves=True, orientation=orient, labels=data_names, leaf_rotation=45, leaf_font_size=10., show_contracted=False) plt.title('Hierarchical Clustering Dendrogram') if orient == 'top': plt.xlabel('Samples') plt.ylabel('Distance') elif orient == 'right': plt.xlabel('Distance') plt.ylabel('Samples') plt.tight_layout() plt2 = plt2MD(plt) plt.clf() params = { 'Input Columns': feature_names, 'Input Mode': input_mode, 'Linkage Method': link, 'Metric': met, 'Number of Rows in Linkage Matrix': num_rows } rb = BrtcReprBuilder() rb.addMD(strip_margin("""# Hierarchical Clustering Result""")) rb.addMD( strip_margin(""" |### Dendrogram | |{image} | |### Parameters | |{display_params} | |### Linkage Matrix | |{out_table1} | """.format(image=plt2, display_params=dict2MD(params), out_table1=pandasDF2MD(linkage_matrix.head(num_rows), num_rows=num_rows + 1)))) model = _model_dict('hierarchical_clustering') model['model'] = Z model['input_mode'] = input_mode model['table'] = out_table if input_mode == 'matrix': model['dist_matrix'] = dist_matrix model['parameters'] = params model['linkage_matrix'] = linkage_matrix model['_repr_brtc_'] = rb.get() return {'model': model}
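# Sketch (hypothetical helper, SciPy only): in 'matrix' mode the routine
# above runs the square distance matrix through
# scipy.spatial.distance.squareform to get the condensed vector linkage()
# expects; passing the square matrix directly would make linkage() treat its
# rows as plain observations.
def _sketch_matrix_mode_linkage():
    import numpy as np
    import scipy.spatial.distance as ssd
    from scipy.cluster.hierarchy import linkage

    dist = np.array([[0.0, 1.0, 4.0],
                     [1.0, 0.0, 2.0],
                     [4.0, 2.0, 0.0]])
    condensed = ssd.squareform(dist)  # upper triangle: [1., 4., 2.]
    Z = linkage(condensed, method='complete')
    return Z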
def _naive_bayes_train(table, feature_cols, label_col, alpha=1.0,
                       fit_prior=True, class_prior=None):
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)
    if class_prior is not None:
        tmp_class_prior = [0] * len(class_prior)
        for elems in class_prior:
            tmp = elems.split(":")
            tmp_class_prior[label_encoder.transform([tmp[0]])[0]] = \
                float(tmp[1])
        class_prior = tmp_class_prior
    nb_model = MultinomialNB(alpha=alpha, fit_prior=fit_prior,
                             class_prior=class_prior)
    nb_model.fit(features, label_correspond)
    class_log_prior = nb_model.class_log_prior_
    feature_log_prob_ = nb_model.feature_log_prob_
    tmp_result = np.hstack(
        (list(map(list, zip(*[label_encoder.classes_] + [class_log_prior]))),
         (feature_log_prob_)))
    column_names = ['labels', 'pi']
    for feature_col in feature_names:
        column_names += ['theta_' + feature_col]
    result_table = pd.DataFrame.from_records(tmp_result,
                                             columns=column_names)
    prediction_correspond = nb_model.predict(features)
    get_param = dict()
    get_param['Lambda'] = alpha
    # get_param['Prior Probabilities of the Classes'] = class_prior
    get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_names
    get_param['Label Column'] = label_col
    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)
    plt.figure()
    _plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_,
                           title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    plt.clf()
    accuracy = nb_model.score(features, label_correspond) * 100
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Naive Bayes Classification Result
    |
    | ### Model: Multinomial
    | {result_table}
    | ### Parameters
    | {table_parameter}
    | ### Predicted vs Actual
    | {image1}
    | #### Accuracy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix, accuracy=accuracy,
               result_table=pandasDF2MD(result_table),
               table_parameter=dict2MD(get_param))))
    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
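# Sketch (hypothetical helper, scikit-learn only): the 'pi' and 'theta_*'
# columns reported above are MultinomialNB's class_log_prior_ and
# feature_log_prob_, i.e. log P(c) and log P(feature | c). A row's per-class
# score is the dot product of its counts with the thetas plus pi, which is
# what predict() maximises.
def _sketch_multinomial_nb_params():
    import numpy as np
    from sklearn.naive_bayes import MultinomialNB

    X = np.array([[2, 1], [3, 0], [0, 4], [1, 3]])
    y = np.array([0, 0, 1, 1])
    nb = MultinomialNB(alpha=1.0).fit(X, y)

    scores = nb.class_log_prior_ + X @ nb.feature_log_prob_.T
    # classes_ is [0, 1] here, so the argmax index equals the label.
    assert (scores.argmax(axis=1) == nb.predict(X)).all()
    return scores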
def _tfidf(table, input_col, max_df=None, min_df=1, num_voca=1000, idf_weighting_scheme='inverseDocumentFrequency', norm='l2', smooth_idf=True, sublinear_tf=False, output_type=False): corpus = np.array(table[input_col]) if max_df == None: max_df = len(corpus) tf_vectorizer = CountVectorizer(stop_words='english', max_df=max_df, min_df=min_df, max_features=num_voca) tf_vectorizer.fit(corpus) csr_matrix_tf = tf_vectorizer.transform(corpus) tfidf_vectorizer = TfidfTransformer(norm=norm, use_idf=True, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf) csr_matrix_tfidf = tfidf_vectorizer.fit_transform(csr_matrix_tf) voca_dict = sorted(tf_vectorizer.vocabulary_.items(), key=itemgetter(1)) len_voca = len(voca_dict) # tf-idf table tfidf_table = pd.DataFrame() document_list = [] docID_list = [] if output_type == False: vocabulary_list = [] label_table = pd.DataFrame() for doc in range(len(corpus)): docID_list += ['doc_{}'.format(doc) for _ in range(len_voca)] document_list += [str(corpus[doc]) for _ in range(len_voca)] vocabulary_list += [voca_dict[j][0] for j in range(len_voca)] label_table['document_id'] = docID_list label_table[input_col] = document_list label_table['vocabulary'] = vocabulary_list tfidf_table = label_table tfidf_table['frequency'] = np.ravel(csr_matrix_tf.todense()) if idf_weighting_scheme == 'inverseDocumentFrequency': tfidf_table['tfidf score'] = np.ravel(csr_matrix_tfidf.todense()) elif idf_weighting_scheme == 'unary': tfidf_table['tfidf score'] = list(map(float, np.array(tfidf_table['frequency']))) elif output_type == True: for doc in range(len(corpus)): docID_list += ['doc_{}'.format(doc) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])] document_list += [str(corpus[doc]) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])] tfidf_table['document_id'] = docID_list tfidf_table[input_col] = document_list tfidf_table['vocabulary'] = [voca_dict[i][0] for i in csr_matrix_tf.indices] tfidf_table['frequency'] = csr_matrix_tf.data data_list = [] for doc in range(len(corpus)): data_list += [csr_matrix_tfidf.data[i] for i in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])][::-1] if idf_weighting_scheme == 'inverseDocumentFrequency': tfidf_table['tfidf score'] = data_list elif idf_weighting_scheme == 'unary': tfidf_table['tfidf score'] = list(map(float, np.array(tfidf_table['frequency']))) else: raise_runtime_error("Please check 'output_type'.") # idf table idf_table = pd.DataFrame() idf_table['vocabulary'] = [voca_dict[j][0] for j in range(len(voca_dict))] if idf_weighting_scheme == 'inverseDocumentFrequency': idf_table['idf weight'] = tfidf_vectorizer.idf_.tolist() elif idf_weighting_scheme == 'unary': idf_table['idf weight'] = float(1) params = { 'Input Column': input_col, 'Max DF': max_df, 'Min DF': min_df, 'Number of Vocabularies': num_voca, 'IDF Weighting Scheme': idf_weighting_scheme, 'Norm': norm, 'Smooth IDF': smooth_idf, 'Sublinear TF': sublinear_tf, 'Remove Zero Counts': output_type } rb = BrtcReprBuilder() rb.addMD(strip_margin("""# TF-IDF Result""")) rb.addMD(strip_margin(""" | |### Parameters | |{display_params} | |### IDF Table | |{idf_table} | |### TFIDF Table | |{tfidf_table} | """.format(display_params=dict2MD(params), idf_table=pandasDF2MD(idf_table, num_rows=200), tfidf_table=pandasDF2MD(tfidf_table, num_rows=200)))) model = _model_dict('tfidf') model['csr_matrix_tf'] = csr_matrix_tf model['csr_matrix_tfidf'] = csr_matrix_tfidf model['parameter'] = params model['idf_table'] = idf_table 
model['tfidf_table'] = tfidf_table model['_repr_brtc_'] = rb.get() return {'model' : model}
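# Sketch (hypothetical helper, scikit-learn only): the two-step
# CountVectorizer -> TfidfTransformer pipeline above matches TfidfVectorizer
# when both are given the same vocabulary and weighting settings, which is a
# compact way to sanity-check the 'tfidf score' column.
def _sketch_tfidf_equivalence():
    import numpy as np
    from sklearn.feature_extraction.text import (
        CountVectorizer, TfidfTransformer, TfidfVectorizer)

    corpus = ['the cat sat', 'the cat sat on the mat', 'dogs and cats']
    counts = CountVectorizer().fit_transform(corpus)
    two_step = TfidfTransformer(norm='l2').fit_transform(counts)
    one_step = TfidfVectorizer(norm='l2').fit_transform(corpus)
    assert np.allclose(two_step.toarray(), one_step.toarray())
    return two_step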