def agglomerative_clustering_train_predict(input_table, input_cols, n_clusters=3,
                                           affinity='euclidean', compute_full_tree=True,
                                           linkage='ward', prediction_col='prediction',
                                           figw=6.4, figh=4.8):
    inputarr = input_table[input_cols]

    agglomerative_clustering = SKAgglomerativeClustering(n_clusters=n_clusters,
                                                         affinity=affinity,
                                                         memory=None,
                                                         connectivity=None,
                                                         compute_full_tree=compute_full_tree,
                                                         linkage=linkage)
    agglomerative_clustering.fit(inputarr)
    input_table[prediction_col] = agglomerative_clustering.labels_

    # build a synthetic linkage matrix (uniform distances) so scipy can draw a dendrogram
    children = agglomerative_clustering.children_
    distance = np.arange(children.shape[0])
    no_of_observations = np.arange(2, children.shape[0] + 2)
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)
    plt.figure(figsize=(figw, figh))
    dendrogram(linkage_matrix)
    plot_dendrogram = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Agglomerative Clustering Result
    | {plot_dendrogram}
    """.format(plot_dendrogram=plot_dendrogram)))

    agglomerative_clustering_result = {
        'model': agglomerative_clustering,
        'input_cols': input_cols,
        '_repr_brtc_': rb.get()
    }

    return {'out_table': input_table, 'agglomerative_result': agglomerative_clustering_result}
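# Usage sketch (not part of the original source): a minimal end-to-end call of the
# function above on a toy frame. Assumes it runs inside this module, where the
# Brightics helpers (plt2MD, BrtcReprBuilder, strip_margin) are already imported;
# the '_example_*' wrapper and the data are hypothetical.
def _example_agglomerative_clustering_train_predict():
    import pandas as pd
    df = pd.DataFrame({'x': [1.0, 1.2, 5.0, 5.2], 'y': [0.9, 1.1, 4.8, 5.1]})
    res = agglomerative_clustering_train_predict(df, input_cols=['x', 'y'], n_clusters=2)
    return res['out_table']['prediction']  # one cluster label per row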
def _ancova(table, response_cols, factor_col, between_col):
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    ## Analysis of Covariance Result
    """))

    groups = table[between_col].unique()
    groups.sort()
    sum_len = np.sum([len(str(group)) for group in groups])

    result = dict()
    result['_grouped_data'] = dict()
    for response_col in response_cols:
        data = table[response_col]
        result['_grouped_data'][response_col] = dict()

        ax = sns.boxplot(x=between_col, y=response_col, data=table, order=groups)
        if sum_len > 512:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        elif sum_len > 64:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
        fig_box = plt2MD(plt)
        plt.clf()

        ancova_res = pg_ancova(data=table, dv=response_col, covar=factor_col, between=between_col)
        ancova_df = pandasDF2MD(ancova_res)
        rb.addMD(strip_margin("""
        | ## {response_col} by {between_col}
        | {fig_box}
        |
        | ### ANCOVA
        | {ancova_df}
        """.format(response_col=response_col, between_col=between_col,
                   fig_box=fig_box, ancova_df=ancova_df)))

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _kmeans_centers_plot(input_cols, cluster_centers):
    sum_len_cols = np.sum([len(col) for col in input_cols])
    x = range(len(input_cols))
    if sum_len_cols >= 512:
        plt.xticks(x, input_cols, rotation='vertical')
    elif sum_len_cols >= 64:
        plt.xticks(x, input_cols, rotation=45, ha='right')
    else:
        plt.xticks(x, input_cols)
    for idx, centers in enumerate(cluster_centers):
        plt.plot(x, centers, "o-", label=idx)
    plt.legend()
    plt.tight_layout()
    fig_centers = plt2MD(plt)
    plt.clf()
    return fig_centers
def _plot_feature_importances(feature_cols, regressor):
    feature_importance = regressor.feature_importances_
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.barh(range(len(indices)), feature_importance[indices], color='b', align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v, i, " {:.2f}".format(v), color='b', va='center', fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.close()
    return fig_feature_importances
def _mean_shift_pca_plot(labels, cluster_centers, pca2_model, pca2, colors):
    for i, color in zip(range(len(cluster_centers)), colors):
        plt.scatter(pca2[:, 0][labels == i], pca2[:, 1][labels == i], color=color)
    pca2_centers = pca2_model.transform(cluster_centers)
    plt.scatter(pca2_centers[:, 0], pca2_centers[:, 1],
                marker='x', edgecolors=1, s=100, color='red')
    plt.tight_layout()
    fig_pca = plt2MD(plt)
    plt.clf()
    return fig_pca
def _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers):
    sum_len_cols = np.sum([len(col) for col in input_cols])
    sample = table[input_cols].sample(n=n_samples)
    x = range(len(input_cols))
    if sum_len_cols >= 512:
        plt.xticks(x, input_cols, rotation='vertical')
    elif sum_len_cols >= 64:
        plt.xticks(x, input_cols, rotation=45, ha='right')
    else:
        plt.xticks(x, input_cols)
    for idx in sample.index:
        plt.plot(x, sample.transpose()[idx], color='grey', linewidth=1)
    for idx, centers in enumerate(cluster_centers):
        plt.plot(x, centers, "o-", label=idx, linewidth=4)
    plt.tight_layout()
    fig_samples = plt2MD(plt)
    plt.clf()
    return fig_samples
def _screeplot(explained_variance, explained_variance_ratio, n_components, ax=None):
    if ax is None:
        ax = plt.gca()
    n_components_range = range(1, len(explained_variance) + 1)
    cum_explained_variance = explained_variance_ratio.cumsum()

    plt.xticks(n_components_range, n_components_range)
    ax.plot(n_components_range, explained_variance, 'o--')
    ax.set_ylabel('Explained Variance')

    ax2 = ax.twinx()
    ax2.plot(n_components_range, cum_explained_variance, 'x-')
    ax2.set_ylim([0, 1.05])
    ax2.set_ylabel('Cumulative Explained Variance Ratio')
    ax2.text(n_components, cum_explained_variance[n_components - 1] - 0.05,
             '%0.4f' % cum_explained_variance[n_components - 1],
             va='center', ha='center')

    fig_scree = plt2MD(plt)
    plt.clf()
    return fig_scree
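# Usage sketch (not part of the original source): feed _screeplot the variance
# arrays of a fitted scikit-learn PCA, which the module already depends on
# elsewhere. The '_example_*' wrapper and the random data are hypothetical.
def _example_screeplot():
    import numpy as np
    from sklearn.decomposition import PCA
    rng = np.random.RandomState(0)
    pca = PCA().fit(rng.rand(50, 4))
    # returns the scree plot rendered as Markdown by plt2MD
    return _screeplot(pca.explained_variance_, pca.explained_variance_ratio_, n_components=2)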
# Variant of _kmeans_samples_plot used by the silhouette workflow: it adds a seed
# for reproducible sampling and explicit per-cluster colors.
def _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers, seed, colors):
    feature_names, inputarr = check_col_type(table, input_cols)
    sum_len_cols = np.sum([len(col) for col in feature_names])
    sample = pd.DataFrame(inputarr).sample(n=n_samples, random_state=seed)
    x = range(len(feature_names))
    if sum_len_cols >= 512:
        plt.xticks(x, feature_names, rotation='vertical')
    elif sum_len_cols >= 64:
        plt.xticks(x, feature_names, rotation=45, ha='right')
    else:
        plt.xticks(x, feature_names)
    for idx in sample.index:
        plt.plot(x, sample.transpose()[idx], color='grey', linewidth=1)
    for idx, centers in enumerate(cluster_centers):
        plt.plot(x, centers, "o-", label=idx, linewidth=2, color=colors[idx])
    plt.tight_layout()
    fig_samples = plt2MD(plt)
    plt.clf()
    return fig_samples
def _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2):
    n_clusters = len(cluster_centers)
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    for i, color in zip(range(n_clusters), colors):
        plt.scatter(pca2[:, 0][labels == i], pca2[:, 1][labels == i], color=color)
    pca2_centers = pca2_model.transform(cluster_centers)
    plt.scatter(pca2_centers[:, 0], pca2_centers[:, 1],
                marker='x', edgecolors=1, s=200, color=colors)
    plt.tight_layout()
    fig_pca = plt2MD(plt)
    plt.clf()
    return fig_pca
def _spectral_clustering_samples_plot(labels, table, input_cols, n_samples, n_clusters, colors):
    sum_len_cols = np.sum([len(col) for col in input_cols])
    sample = table[input_cols].sample(n=n_samples) if n_samples is not None else table[input_cols]
    x = range(len(input_cols))
    if sum_len_cols >= 512:
        plt.xticks(x, input_cols, rotation='vertical')
    elif sum_len_cols >= 64:
        plt.xticks(x, input_cols, rotation=45, ha='right')
    else:
        plt.xticks(x, input_cols)
    for idx in sample.index:
        plt.plot(x, sample.transpose()[idx], color=colors[labels[idx]], linewidth=1)
    plt.tight_layout()
    fig_samples = plt2MD(plt)
    plt.clf()
    return fig_samples
# Variant of _kmeans_pca_plot that takes precomputed colors and also handles the
# degenerate case where the PCA projection has a single component.
def _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2, colors):
    n_clusters = len(cluster_centers)
    pca2_centers = pca2_model.transform(cluster_centers)
    if pca2.shape[1] == 1:
        for i, color in zip(range(n_clusters), colors):
            plt.scatter(pca2[:, 0][labels == i], pca2[:, 0][labels == i], color=color)
        plt.scatter(pca2_centers[:, 0], pca2_centers[:, 0],
                    marker='x', edgecolors=1, s=200, color=colors)
        plt.xlabel("Feature space for the 1st feature")
        plt.ylabel("Feature space for the 1st feature")
    else:
        for i, color in zip(range(n_clusters), colors):
            plt.scatter(pca2[:, 0][labels == i], pca2[:, 1][labels == i], color=color)
        plt.scatter(pca2_centers[:, 0], pca2_centers[:, 1],
                    marker='x', edgecolors=1, s=200, color=colors)
        plt.xlabel("Feature space for the 1st feature")
        plt.ylabel("Feature space for the 2nd feature")
    plt.tight_layout()
    fig_pca = plt2MD(plt)
    plt.clf()
    return fig_pca
def _timeseries_decomposition(table, input_col, frequency, model_type='additive',
                              filteration=None, two_sided=True, extrapolate_trend=0):
    out_table = table.copy()
    decomposition = sm.tsa.seasonal_decompose(out_table[input_col],
                                              model=model_type,
                                              filt=filteration,
                                              freq=frequency,
                                              two_sided=two_sided,
                                              extrapolate_trend=extrapolate_trend)
    decomposition.plot()
    plt2 = plt2MD(plt)
    plt.clf()

    out_table['trend'] = decomposition.trend
    out_table['seasonal'] = decomposition.seasonal
    out_table['residual'] = decomposition.resid

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Time Series Decomposition Result
    | Model Type : {model_type}
    |
    | {image2}
    |
    """.format(model_type=model_type, image2=plt2)))

    model = _model_dict('timeseries_decomposition')
    model['model_type'] = model_type
    model['_repr_brtc_'] = rb.get()
    return {'out_table': out_table, 'model': model}
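# Usage sketch (not part of the original source): decompose a short synthetic
# series with a linear trend and a period-4 seasonal term. Assumes the statsmodels
# version this module pins (seasonal_decompose still accepts the 'freq' keyword);
# the '_example_*' wrapper and the data are hypothetical.
def _example_timeseries_decomposition():
    import numpy as np
    import pandas as pd
    t = np.arange(24)
    df = pd.DataFrame({'y': 10 + 0.5 * t + np.sin(2 * np.pi * t / 4)})
    res = _timeseries_decomposition(df, input_col='y', frequency=4)
    return res['out_table'][['trend', 'seasonal', 'residual']]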
def _agglomerative_clustering_samples_plot(labels, table, input_cols, n_samples, n_clusters, colors):
    sample = table[input_cols].sample(n=n_samples) if n_samples is not None else table[input_cols]
    feature_names, sample = check_col_type(sample, input_cols)
    sum_len_cols = np.sum([len(col) for col in feature_names])
    x = range(len(feature_names))
    if sum_len_cols >= 512:
        plt.xticks(x, feature_names, rotation='vertical')
    elif sum_len_cols >= 64:
        plt.xticks(x, feature_names, rotation=45, ha='right')
    else:
        plt.xticks(x, feature_names)
    if feature_names == input_cols:
        for idx in sample.index:
            plt.plot(x, sample.transpose()[idx], color='grey', linewidth=1)
    else:
        for idx in range(len(sample)):
            plt.plot(x, sample[idx], color='grey', linewidth=1)
    plt.tight_layout()
    fig_samples = plt2MD(plt)
    plt.clf()
    return fig_samples
def _isotonic_regression_train(table, feature_col, label_col, increasing=True):
    if feature_col == label_col:
        raise BFE.from_errors([{
            '0100': '{} is used as both the feature column and the label column'.format(feature_col)
        }])
    features = table[feature_col]
    label = table[label_col]

    isotonic_model = IsotonicRegression(increasing=increasing)
    isotonic_model.fit(features, label)
    predict = isotonic_model.predict(features)

    plt.figure()
    plt.plot(label, 'r.-')
    plt.plot(predict, 'b.-')
    plt.xlabel('Samples')
    plt.legend(['True label', 'Predicted'])
    fig_actual_predict = plt2MD(plt)

    get_param = isotonic_model.get_params()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Isotonic Regression Result
    | ### Param
    | {param}
    | ### Predicted vs Actual
    | {image1}
    """.format(image1=fig_actual_predict, param=get_param)))

    model = _model_dict('isotonic_regression_model')
    model['_repr_brtc_'] = rb.get()
    model['feature_col'] = feature_col
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['regressor'] = isotonic_model
    return {'model': model}
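# Usage sketch (not part of the original source): fit an increasing isotonic
# regression to a small noisy-but-monotone series. The '_example_*' wrapper and
# the data are hypothetical; the Brightics helpers are assumed importable.
def _example_isotonic_regression_train():
    import pandas as pd
    df = pd.DataFrame({'x': [1, 2, 3, 4, 5], 'y': [1.0, 1.5, 1.3, 3.0, 4.2]})
    res = _isotonic_regression_train(df, feature_col='x', label_col='y')
    return res['model']['regressor']  # the fitted sklearn IsotonicRegression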
def _holt_winters_predict(model, prediction_num):
    rb = BrtcReprBuilder()
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    df1['number'] = np.arange(1, prediction_num + 1, 1)
    for column in model['input_columns']:
        df2[column] = model['hw_' + str(column)].forecast(prediction_num)
    reindex_df2 = df2.reset_index(drop=True)
    predict_table = df1.join(reindex_df2)

    rb.addMD(strip_margin("""
    |## Holt-Winters Predict Result
    |
    """))
    for column in model['input_columns']:
        plt.title(column)
        plt.plot(model['origin_table'][column].index, model['origin_table'][column], label='Train')
        plt.plot(df2[column].index, df2[column], label='Prediction')
        plt.legend(loc='best')
        rb.addMD(strip_margin("""
        |{plot}
        |
        """.format(plot=plt2MD(plt))))
        plt.clf()

    model['_repr_brtc_'] = rb.get()
    model['predict_table'] = predict_table
    return {'model': model, 'out_table': predict_table}
def _association_rule_visualization(table, option='multiple_to_single', edge_length_scaling=1,
                                    font_size=10, node_size_scaling=1, figure_size_muliplier=1,
                                    display_rule_num=False):
    if option == 'single_to_single':
        result_network = table.copy()

        length_ante = []
        string_ante = []
        length_conse = []
        string_conse = []
        for row in result_network['antecedent']:
            length_ante += [len(row)]
            string_ante += [row[0]]
        for row in result_network['consequent']:
            length_conse += [len(row)]
            string_conse += [row[0]]
        result_network['length_ante'] = length_ante
        result_network['string_ante'] = string_ante
        result_network['length_conse'] = length_conse
        result_network['string_conse'] = string_conse
        result_network = result_network[result_network.length_ante == 1]
        result_network = result_network[result_network.length_conse == 1]
        result_network['support_ante'] = result_network['support'] / result_network['confidence']
        result_network['support_conse'] = result_network['confidence'] / result_network['lift']
        # edges_colors = preprocessing.LabelEncoder()
        # edges_colors.fit(result_network['lift'])
        # edges_colors = edges_colors.transform(result_network['lift'])
        # result_network['edge_colors'] = edges_colors
        result_network = result_network.reset_index()

        edges = []
        for i in range(len(result_network.string_ante)):
            edges += [(result_network.string_ante[i], result_network.string_conse[i])]

        G = nx.DiGraph()
        G.add_edges_from(edges)
        nodes = G.nodes()
        plt.figure(figsize=(4 * len(nodes) ** 0.5 * figure_size_muliplier,
                            4 * len(nodes) ** 0.5 * figure_size_muliplier))
        pos = nx.spring_layout(G, k=0.4 * edge_length_scaling)

        node_tmp = list(result_network.string_ante) + list(result_network.string_conse)
        support_tmp = list(result_network.support_ante) + list(result_network.support_conse)
        tmp_node_support = []
        for i in range(len(node_tmp)):
            tmp_node_support += [[node_tmp[i], support_tmp[i]]]
        nodes_table = pd.DataFrame.from_records(tmp_node_support, columns=['name', 'support'])
        nodes_table = nodes_table.drop_duplicates(['name'])

        node_color = []
        nodes_table = nodes_table.reset_index()
        scaled_support = _scaling(nodes_table.support)
        for node in nodes:
            for i in range(len(nodes_table.name)):
                if nodes_table.name[i] == node:
                    node_color += [scaled_support[i] * 2500 * node_size_scaling]
                    break

        # if scaling == True:
        #     edge_color = [result_network['edge_colors'][n] for n in range(len(result_network['length_conse']))]
        # else:
        scaled_support = _scaling(result_network['confidence'])
        edge_size = [scaled_support[n] * 8 for n in range(len(result_network['length_conse']))]
        edge_color = [result_network['lift'][n] for n in range(len(result_network['length_conse']))]
        nx.draw(G, pos,
                node_color=node_color,
                edge_color=edge_color,
                node_size=node_color,
                arrowsize=20 * (0.2 + 0.8 * node_size_scaling),
                font_family='NanumGothic',
                with_labels=True,
                cmap=plt.cm.Blues,
                edge_cmap=plt.cm.Reds,
                arrows=True,
                width=edge_size,
                font_size=font_size)
        fig_digraph = plt2MD(plt)

        graph_min_support = np.min(nodes_table.support)
        graph_max_support = np.max(nodes_table.support)
        graph_min_confidence = np.min(result_network['confidence'])
        graph_max_confidence = np.max(result_network['confidence'])
        graph_min_lift = np.min(result_network['lift'])
        graph_max_lift = np.max(result_network['lift'])
        rb = BrtcReprBuilder()
        rb.addMD(strip_margin("""
        | ### Network Digraph
        | ##### Node color, size : support ({graph_min_support}~{graph_max_support})
        | ##### Edge color : lift ({graph_min_lift}~{graph_max_lift})
        | ##### Edge size : confidence ({graph_min_confidence}~{graph_max_confidence})
        | {image1}
        |
        """.format(image1=fig_digraph,
                   graph_min_support=graph_min_support,
                   graph_max_support=graph_max_support,
                   graph_min_lift=graph_min_lift,
                   graph_max_lift=graph_max_lift,
                   graph_min_confidence=graph_min_confidence,
                   graph_max_confidence=graph_max_confidence)))
    elif option == 'multiple_to_single':
        result_network = table.copy()

        length_conse = []
        string_conse = []
        for row in result_network['consequent']:
            length_conse += [len(row)]
            string_conse += [row[0]]
        result_network['length_conse'] = length_conse
        result_network['consequent'] = string_conse
        result_network = result_network[result_network.length_conse == 1]
        index_list = result_network.index.tolist()

        rownum = []
        for i in range(len(result_network['consequent'])):
            if display_rule_num:
                rownum += ['R%d' % (i + 1)]
            else:
                rownum += [_n_blank_strings(i + 1)]
        result_network['row_number'] = rownum

        edges = []
        nodes = []
        for i in index_list:
            for j in range(len(result_network.antecedent[i])):
                edges += [(result_network.antecedent[i][j], result_network['row_number'][i])]
            edges += [(result_network['row_number'][i], result_network.consequent[i])]
            nodes += [result_network['row_number'][i]]

        G = nx.DiGraph()
        G.add_nodes_from(nodes)
        G.add_edges_from(edges)
        plt.figure(figsize=(2 * len(nodes) ** 0.5 * figure_size_muliplier,
                            2 * len(nodes) ** 0.5 * figure_size_muliplier))
        pos = nx.spring_layout(G, k=0.2 * edge_length_scaling)

        nodes_color = []
        nodes_size = []
        scaled_lift = _scaling(result_network.lift)
        for node in range(len(G.nodes())):
            if node < len(nodes):
                nodes_color += [result_network.support[index_list[node]]]
                nodes_size += [scaled_lift[node] * 2000 * node_size_scaling]
            else:
                nodes_color += [0]
                nodes_size += [0]

        nx.draw(G, pos,
                node_color=nodes_color,
                node_size=nodes_size,
                font_family='NanumGothic',
                with_labels=True,
                cmap=plt.cm.Reds,
                arrows=True,
                edge_color='Grey',
                font_weight='bold',
                arrowsize=20 * (0.2 + 0.8 * node_size_scaling),
                font_size=font_size)
        fig_digraph = plt2MD(plt)

        graph_min_support = np.min(result_network.support)
        graph_max_support = np.max(result_network.support)
        graph_min_lift = np.min(result_network.lift)
        graph_max_lift = np.max(result_network.lift)
        rb = BrtcReprBuilder()
        rb.addMD(strip_margin("""
        | ### Network Digraph
        | ##### Size of circle : support ({graph_min_support}~{graph_max_support})
        | ##### Color of circle : lift ({graph_min_lift}~{graph_max_lift})
        | {image1}
        |
        """.format(image1=fig_digraph,
                   graph_min_support=graph_min_support,
                   graph_max_support=graph_max_support,
                   graph_min_lift=graph_min_lift,
                   graph_max_lift=graph_max_lift)))
    else:
        result_network = table.copy()

        length_conse = []
        for row in result_network['consequent']:
            length_conse += [len(row)]
        result_network['length_conse'] = length_conse
        result_network = result_network.reset_index()

        rownum = []
        for i in range(len(result_network['consequent'])):
            if display_rule_num:
                rownum += ['R%d' % (i + 1)]
            else:
                rownum += [_n_blank_strings(i + 1)]
        result_network['row_number'] = rownum

        edges = []
        nodes = []
        for i in range(len(result_network.consequent)):
            for j in range(len(result_network.antecedent[i])):
                edges += [(result_network.antecedent[i][j], result_network['row_number'][i])]
            for j in range(len(result_network.consequent[i])):
                edges += [(result_network['row_number'][i], result_network.consequent[i][j])]
            nodes += [result_network['row_number'][i]]

        G = nx.DiGraph()
        G.add_nodes_from(nodes)
        G.add_edges_from(edges)
        plt.figure(figsize=(2 * len(nodes) ** 0.5 * figure_size_muliplier,
                            2 * len(nodes) ** 0.5 * figure_size_muliplier))
        pos = nx.spring_layout(G, k=0.2 * edge_length_scaling)

        nodes_color = []
        nodes_size = []
        scaled_lift = _scaling(result_network.lift)
        for node in range(len(G.nodes())):
            if node < len(nodes):
                nodes_color += [result_network.support[node]]
                nodes_size += [scaled_lift[node] * 2000 * node_size_scaling]
            else:
                nodes_color += [0]
                nodes_size += [0]

        nx.draw(G, pos,
                node_color=nodes_color,
                node_size=nodes_size,
                font_family='NanumGothic',
                with_labels=True,
                cmap=plt.cm.Reds,
                arrows=True,
                edge_color='Grey',
                font_weight='bold',
                arrowsize=20 * (0.2 + 0.8 * node_size_scaling),
                font_size=font_size)
        fig_digraph = plt2MD(plt)

        graph_min_support = np.min(result_network.support)
        graph_max_support = np.max(result_network.support)
        graph_min_lift = np.min(result_network.lift)
        graph_max_lift = np.max(result_network.lift)
        rb = BrtcReprBuilder()
        rb.addMD(strip_margin("""
        | ### Network Digraph
        | ##### Size of circle : support ({graph_min_support}~{graph_max_support})
        | ##### Color of circle : lift ({graph_min_lift}~{graph_max_lift})
        | {image1}
        |
        """.format(image1=fig_digraph,
                   graph_min_support=graph_min_support,
                   graph_max_support=graph_max_support,
                   graph_min_lift=graph_min_lift,
                   graph_max_lift=graph_max_lift)))

    model = _model_dict('Association rule')
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _autocorrelation(table, input_col, nlags=20, conf_level=0.95):
    data = table[input_col]

    plt.figure()
    plot_acf(data, lags=nlags, alpha=1 - conf_level)
    fig_plt_acf = plt2MD(plt)
    plt.clf()

    plt.figure()
    plot_pacf(data, lags=nlags, alpha=1 - conf_level)
    fig_plt_pacf = plt2MD(plt)
    plt.clf()

    acf_ret = acf(data, nlags=nlags, alpha=1 - conf_level)
    pacf_ret = pacf(data, nlags=nlags, alpha=1 - conf_level)

    result_table1 = pd.DataFrame([])
    result_table1['lag'] = list(range(nlags + 1))
    result_table1['ACF'] = acf_ret[0]
    if conf_level is not None:
        result_table1['%g%% Confidence Interval' % (conf_level * 100)] = [
            str((acf_ret[1][i][0], acf_ret[1][i][1])) for i in range(nlags + 1)]

    result_table2 = pd.DataFrame([])
    result_table2['lag'] = list(range(nlags + 1))
    result_table2['PACF'] = pacf_ret[0]
    if conf_level is not None:
        result_table2['%g%% Confidence Interval' % (conf_level * 100)] = [
            str((pacf_ret[1][i][0], pacf_ret[1][i][1])) for i in range(nlags + 1)]

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Autocorrelation / Partial Autocorrelation Result"""))
    rb.addMD(strip_margin("""
    |## Autocorrelation
    |
    |{image1}
    |
    |### Autocorrelation Table
    |
    |{result_table1}
    |
    |## Partial Autocorrelation
    |
    |{image2}
    |
    |### Partial Autocorrelation Table
    |
    |{result_table2}
    |
    """.format(image1=fig_plt_acf,
               result_table1=pandasDF2MD(result_table1, num_rows=nlags + 1),
               image2=fig_plt_pacf,
               result_table2=pandasDF2MD(result_table2, num_rows=nlags + 1))))

    model = _model_dict('autocorrelation')
    model['autocorrelation_table'] = result_table1
    model['partial_autocorrelation_table'] = result_table2
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
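# Usage sketch (not part of the original source): ACF/PACF of a random walk.
# nlags is kept small because pacf requires nlags to be well below the series
# length. The '_example_*' wrapper and the data are hypothetical.
def _example_autocorrelation():
    import numpy as np
    import pandas as pd
    rng = np.random.RandomState(0)
    df = pd.DataFrame({'y': rng.randn(100).cumsum()})
    res = _autocorrelation(df, input_col='y', nlags=10)
    return res['model']['autocorrelation_table']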
def _decision_tree_regression_train(table, feature_cols, label_col,  # fig_size=np.array([6.4, 4.8]),
                                    criterion='mse', splitter='best', max_depth=None,
                                    min_samples_split=2, min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0, max_features=None,
                                    random_state=None, max_leaf_nodes=None,
                                    min_impurity_decrease=0.0, min_impurity_split=None,
                                    presort=False, sample_weight=None, check_input=True,
                                    X_idx_sorted=None):
    param_validation_check = [
        greater_than_or_equal_to(min_samples_split, 2, 'min_samples_split'),
        greater_than_or_equal_to(min_samples_leaf, 1, 'min_samples_leaf'),
        greater_than_or_equal_to(min_weight_fraction_leaf, 0.0, 'min_weight_fraction_leaf')
    ]
    if max_depth is not None:
        param_validation_check.append(greater_than_or_equal_to(max_depth, 1, 'max_depth'))
    validate(*param_validation_check)

    regressor = DecisionTreeRegressor(criterion, splitter, max_depth, min_samples_split,
                                      min_samples_leaf, min_weight_fraction_leaf, max_features,
                                      random_state, max_leaf_nodes, min_impurity_decrease,
                                      min_impurity_split, presort)
    regressor.fit(table[feature_cols], table[label_col],
                  sample_weight, check_input, X_idx_sorted)

    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(regressor, out_file=dot_data,
                        feature_names=feature_cols,
                        filled=True, rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = ("Graphviz is needed to draw a Decision Tree graph. "
                    "Please download it from http://graphviz.org/download/ and install it to your computer.")

    # json
    model = _model_dict('decision_tree_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    feature_importance = regressor.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = regressor.max_features_
    model['n_features'] = regressor.n_features_
    model['n_outputs'] = regressor.n_outputs_
    model['tree'] = regressor.tree_
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor

    # report
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), feature_importance[indices], color='b', align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v, i, " {:.2f}".format(v), color='b', va='center', fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_cols).T

    # Add tree plot
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Decision Tree Regression Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
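# Usage sketch (not part of the original source): train a shallow regression tree
# on a toy frame. Assumes the scikit-learn version this module pins (criterion='mse'
# and the presort argument); the '_example_*' wrapper and the data are hypothetical.
def _example_decision_tree_regression_train():
    import pandas as pd
    df = pd.DataFrame({'x1': [1, 2, 3, 4, 5, 6],
                       'x2': [6, 5, 4, 3, 2, 1],
                       'y': [1.1, 1.9, 3.2, 3.9, 5.1, 6.0]})
    res = _decision_tree_regression_train(df, feature_cols=['x1', 'x2'], label_col='y', max_depth=3)
    return res['model']['feature_importance']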
def _penalized_linear_regression_train(table, feature_cols, label_col, regression_type='ridge',
                                       alpha=1.0, l1_ratio=0.5, fit_intercept=True, max_iter=1000,
                                       tol=0.0001, random_state=None):
    out_table = table.copy()
    feature_names, features = check_col_type(out_table, feature_cols)
    label = out_table[label_col]
    if regression_type == 'ridge':
        regression_model = Ridge(alpha=alpha, fit_intercept=fit_intercept, max_iter=None,
                                 tol=tol, solver='auto', random_state=random_state)
    elif regression_type == 'lasso':
        regression_model = Lasso(alpha=alpha, fit_intercept=fit_intercept, max_iter=max_iter,
                                 tol=tol, random_state=random_state, selection='random')
    elif regression_type == 'elastic_net':
        regression_model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept,
                                      max_iter=max_iter, tol=tol, random_state=random_state,
                                      selection='random')
    else:
        raise_runtime_error("Please check 'regression_type'.")

    # fit once; reuse the fitted attributes below
    regression_model.fit(features, label)

    out_table1 = pd.DataFrame([])
    out_table1['x_variable_name'] = [variable for variable in feature_names]
    out_table1['coefficient'] = regression_model.coef_
    if fit_intercept:
        intercept = pd.DataFrame([['intercept', regression_model.intercept_]],
                                 columns=['x_variable_name', 'coefficient'])
        out_table1 = pd.concat([out_table1, intercept], ignore_index=True)

    predict = regression_model.predict(features)
    residual = label - predict
    out_table['predict'] = predict
    out_table['residual'] = residual

    if regression_type == 'elastic_net':
        params = {'Feature Columns': feature_names,
                  'Label Column': label_col,
                  'Regression Type': regression_type,
                  'Regularization (Penalty Weight)': alpha,
                  'L1 Ratio': l1_ratio,
                  'Fit Intercept': fit_intercept,
                  'Maximum Number of Iterations': max_iter,
                  'Tolerance': tol}
    else:
        params = {'Feature Columns': feature_names,
                  'Label Column': label_col,
                  'Regression Type': regression_type,
                  'Regularization (Penalty Weight)': alpha,
                  'Fit Intercept': fit_intercept,
                  'Maximum Number of Iterations': max_iter,
                  'Tolerance': tol}

    score = {'MSE': mean_squared_error(label, predict),
             'R2': r2_score(label, predict)}

    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()

    # checking the magnitude of coefficients
    plt.figure()
    predictors = feature_names
    coef = Series(regression_model.coef_, predictors).sort_values()
    coef.plot(kind='bar', title='Model Coefficients')
    plt.tight_layout()
    fig_model_coefficients = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | # Penalized Linear Regression Result
    | ### Selected Parameters:
    | {params}
    |
    | ## Results
    | ### Model Parameters
    | {out_table1}
    |
    | ### Regression Score
    | {score}
    |
    """.format(params=dict2MD(params),
               out_table1=pandasDF2MD(out_table1),
               score=dict2MD(score))))
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    |
    | ### Magnitude of Coefficients
    | {image5}
    |
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3,
               image5=fig_model_coefficients)))

    model = _model_dict('penalized_linear_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['regression_type'] = regression_type
    model['regression_model'] = regression_model
    model['parameters'] = params
    model['model_parameters'] = out_table1
    model['prediction_residual'] = out_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _hierarchical_clustering(table, input_cols, input_mode='original', key_col=None,
                             link='complete', met='euclidean', num_rows=20,
                             figure_height=6.4, orient='right'):
    out_table = table.copy()
    features = out_table[input_cols]

    if input_mode == 'original':
        len_features = len(features)
        if key_col is not None:
            data_names = list(out_table[key_col])
        else:
            data_names = ['pt_' + str(i) for i in range(len_features)]
        out_table['name'] = data_names
        Z = linkage(ssd.pdist(features, metric=met), method=link, metric=met)
    elif input_mode == 'matrix':
        len_features = len(input_cols)
        if key_col is not None:
            data_names = []
            for column in input_cols:
                data_names.append(out_table[key_col][out_table.columns.get_loc(column)])
        else:
            data_names = []
            for column in input_cols:
                data_names.append(out_table.columns[out_table.columns.get_loc(column)])
        col_index = []
        for column in input_cols:
            col_index.append(out_table.columns.get_loc(column))
        dist_matrix = features.iloc[col_index]
        Z = linkage(ssd.squareform(dist_matrix), method=link, metric=met)
        dist_matrix['name'] = data_names
    else:
        raise_runtime_error("Please check 'input_mode'.")

    range_len_Z = range(len(Z))
    linkage_matrix = pd.DataFrame([])
    linkage_matrix['linkage step'] = ['%g' % (x + 1) for x in reversed(range_len_Z)]
    linkage_matrix['name of clusters'] = ['CL_%g' % (i + 1) for i in reversed(range_len_Z)]

    joined_column1 = []
    for i in range_len_Z:
        if Z[:, 0][i] < len_features:
            joined_column1.append(data_names[int(Z[:, 0][i])])
        else:
            joined_column1.append(linkage_matrix['name of clusters'][Z[:, 0][i] - len_features])
    linkage_matrix['joined column1'] = joined_column1

    joined_column2 = []
    for i in range_len_Z:
        if Z[:, 1][i] < len_features:
            joined_column2.append(data_names[int(Z[:, 1][i])])
        else:
            joined_column2.append(linkage_matrix['name of clusters'][Z[:, 1][i] - len_features])
    linkage_matrix['joined column2'] = joined_column2

    linkage_matrix['distance'] = [distance for distance in Z[:, 2]]
    linkage_matrix['number of original'] = [int(entities) for entities in Z[:, 3]]
    linkage_matrix = linkage_matrix.reindex(index=linkage_matrix.index[::-1])

    # calculate full dendrogram
    plt.figure(figsize=(8.4, figure_height))
    dendrogram(Z,
               truncate_mode='none',
               get_leaves=True,
               orientation=orient,
               labels=data_names,
               leaf_rotation=45,
               leaf_font_size=10.,
               show_contracted=False)
    plt.title('Hierarchical Clustering Dendrogram')
    if orient == 'top':
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient == 'right':
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    plt.tight_layout()
    plt2 = plt2MD(plt)
    plt.clf()

    params = {'Input Columns': input_cols,
              'Input Mode': input_mode,
              'Linkage Method': link,
              'Metric': met,
              'Number of Rows in Linkage Matrix': num_rows}

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Hierarchical Clustering Result"""))
    rb.addMD(strip_margin("""
    |### Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    |{display_params}
    |
    |### Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=plt2,
               display_params=dict2MD(params),
               out_table1=pandasDF2MD(linkage_matrix.head(num_rows), num_rows=num_rows + 1))))

    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_mode'] = input_mode
    model['table'] = out_table
    if input_mode == 'matrix':
        model['dist_matrix'] = dist_matrix
    model['parameters'] = params
    model['linkage_matrix'] = linkage_matrix
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
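# Usage sketch (not part of the original source): cluster four well-separated
# points in 'original' mode and inspect the human-readable linkage matrix.
# The '_example_*' wrapper and the data are hypothetical.
def _example_hierarchical_clustering():
    import pandas as pd
    df = pd.DataFrame({'x': [0.0, 0.1, 5.0, 5.1], 'y': [0.0, 0.2, 5.0, 5.2]})
    res = _hierarchical_clustering(df, input_cols=['x', 'y'], link='average')
    return res['model']['linkage_matrix']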
def _linear_regression_train(table, feature_cols, label_col, fit_intercept=True,
                             is_vif=False, vif_threshold=10):
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]
    if fit_intercept:
        features = sm.add_constant(features, has_constant='add')
    lr_model_fit = sm.OLS(label, features).fit()

    predict = lr_model_fit.predict(features)
    residual = label - predict

    summary = lr_model_fit.summary()
    summary_tables = simple_tables2df_list(summary.tables, drop_index=True)
    summary0 = summary_tables[0]
    summary1 = summary_tables[1]
    if not isinstance(features, pd.DataFrame):
        features = pd.DataFrame(features)
    if is_vif:
        summary1['VIF'] = [variance_inflation_factor(features.values, i)
                           for i in range(features.shape[1])]
        summary1['VIF>{}'.format(vif_threshold)] = summary1['VIF'].apply(
            lambda _: 'true' if _ > vif_threshold else 'false')
        summary.tables[1] = _df_to_simpletable(summary1)
    summary2 = summary_tables[2]
    html_result = summary.as_html()

    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3)))

    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['fit_intercept'] = fit_intercept
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['_repr_brtc_'] = rb.get()
    model['summary0'] = summary0
    model['summary1'] = summary1
    model['summary2'] = summary2

    lr_model_fit.remove_data()
    model['lr_model'] = lr_model_fit
    return {'model': model}
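# Usage sketch (not part of the original source): an OLS fit with intercept on
# synthetic data whose true relation is y = 3x + 1. Assumes the module helpers
# (check_col_type, simple_tables2df_list) are in scope; the '_example_*' wrapper
# and the data are hypothetical.
def _example_linear_regression_train():
    import numpy as np
    import pandas as pd
    rng = np.random.RandomState(0)
    df = pd.DataFrame({'x': rng.rand(40)})
    df['y'] = 3.0 * df['x'] + 1.0 + 0.05 * rng.randn(40)
    model = _linear_regression_train(df, feature_cols=['x'], label_col='y')['model']
    return model['coefficients'], model['r2']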
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10),
                                     prediction_col='prediction', init='k-means++', n_init=10,
                                     max_iter=300, tol=1e-4, precompute_distances='auto',
                                     seed=None, n_jobs=1, algorithm='auto', n_samples=None):
    feature_names, features = check_col_type(table, input_cols)
    if n_samples is None:
        n_samples = len(table)
    inputarr = features

    pca2_model = PCA(n_components=min(2, len(feature_names))).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    silhouette_list = []
    models = []
    centers_list = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
                           precompute_distances=precompute_distances, verbose=0,
                           random_state=seed, copy_x=True, n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_
        centers_list.append(centersk)

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)
        # silhouette_samples_list.append(samples)

        pca2_centers = pca2_model.transform(centersk)

        _, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0
        for i, color in zip(range(k), colors):
            si = samples[predict == i]
            si.sort()
            sizei = si.shape[0]
            y_upper = y_lower + sizei
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, si,
                              facecolor=color, edgecolor=color, alpha=0.7)
            # cluster label
            ax1.text(0.9, y_lower + 0.45 * sizei, str(i))
            y_lower = y_upper
            if pca2.shape[1] == 1:
                ax2.scatter(pca2[:, 0][predict == i], pca2[:, 0][predict == i], color=color)
            else:
                ax2.scatter(pca2[:, 0][predict == i], pca2[:, 1][predict == i], color=color)

        ax1.axvline(x=score, color="red")
        ax1.set_xlim(right=1.0)
        ax1.set_yticks([])
        ax1.set_xlabel("Silhouette coefficient values")
        ax1.set_ylabel("Cluster label")
        if pca2.shape[1] == 1:
            ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 0],
                        marker='x', edgecolors=1, s=200, color=colors)
            ax2.set_xlabel("Feature space for the 1st feature")
            ax2.set_ylabel("Feature space for the 1st feature")
        else:
            ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 1],
                        marker='x', edgecolors=1, s=200, color=colors)
            ax2.set_xlabel("Feature space for the 1st feature")
            ax2.set_ylabel("Feature space for the 2nd feature")
        plt.tight_layout()
        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)

    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    best_labels = best_model.labels_
    best_sse = best_model.inertia_

    n_clusters = len(best_centers)
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    fig_centers = _kmeans_centers_plot(feature_names, best_centers, colors)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, best_centers, seed, colors)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2, colors)

    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    plt.xlabel("Number of Clusters k")
    plt.tight_layout()
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silhouette metrics:
    | {fig_silhouette}
    | - best K: {best_k}
    | - Sum of square error: {best_sse}.
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette,
               best_k=best_k,
               best_sse=best_sse,
               fig_pca=fig_pca,
               fig_centers=fig_centers,
               fig_samples=fig_samples)))
    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = predict
    # out_table['silhouette'] = silhouette_samples_list[best_k-2]
    # out_table = out_table.sort_values(by=['prediction','silhouette'])
    # out_table = out_table.reset_index(drop=True)
    return {'out_table': out_table, 'model': model}
def _evaluate_classification(table, label_col, prediction_col, average="weighted"):
    if average == 'None':
        average = None

    label = table[label_col]
    predict = table[prediction_col]

    # compute metrics
    accuracy = accuracy_score(label, predict)
    f1 = f1_score(label, predict, average=average)
    precision = precision_score(label, predict, average=average)
    recall = recall_score(label, predict, average=average)
    class_names = np.unique(np.union1d(label.values, predict.values))

    # Plot non-normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label, predict, classes=class_names,
                           title='Confusion matrix, without normalization')
    fig_cnf_matrix = plt2MD(plt)
    plt.clf()

    # Plot normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label, predict, classes=class_names, normalize=True,
                           title='Normalized confusion matrix')
    fig_cnf_matrix_normalized = plt2MD(plt)
    plt.clf()

    # json
    summary = dict()
    summary['label_col'] = label_col
    summary['prediction_col'] = prediction_col
    summary['f1_score'] = f1
    summary['accuracy_score'] = accuracy
    summary['precision_score'] = precision
    summary['recall_score'] = recall

    # report
    if average == 'weighted' or average == 'macro':
        all_dict_list = [{'f1': f1, 'precision': precision, 'recall': recall}]
        all_df = pd.DataFrame(all_dict_list)
        all_df = all_df[['f1', 'precision', 'recall']]
    else:
        all_dict_list = [f1, precision, recall]
        all_df = pd.DataFrame(all_dict_list)
        all_df = all_df.transpose()
        all_df.columns = ['f1', 'precision', 'recall']
        # per-class rows follow scikit-learn's sorted label order
        all_df['label'] = class_names
        all_df = all_df[['label'] + all_df.columns[:-1].tolist()]
    summary['metrics'] = all_df

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Evaluate Classification Result
    |
    | ### Accuracy : {accuracy}
    |
    | ### Metrics
    | {table1}
    |
    | ### Confusion matrix
    | {fig_confusion_matrix}
    |
    | {fig_confusion_matrix_normalized}
    |
    """.format(accuracy=accuracy,
               table1=pandasDF2MD(all_df),
               fig_confusion_matrix=fig_cnf_matrix,
               fig_confusion_matrix_normalized=fig_cnf_matrix_normalized)))
    summary['_repr_brtc_'] = rb.get()

    return {'result': summary}
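# Usage sketch (not part of the original source): evaluate weighted-average
# metrics on a four-row toy table. Assumes the module's _plot_confusion_matrix
# helper is in scope; the '_example_*' wrapper and the data are hypothetical.
def _example_evaluate_classification():
    import pandas as pd
    df = pd.DataFrame({'label': ['a', 'a', 'b', 'b'],
                       'pred': ['a', 'b', 'b', 'b']})
    res = _evaluate_classification(df, 'label', 'pred')
    return res['result']['accuracy_score']  # 0.75 for this table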
def _pca(table, input_cols, new_column_name='projected_', n_components=None, copy=True,
         whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', seed=None,
         hue=None, alpha=0, key_col=None):
    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    # fit with all components; the first n_components are sliced off below
    pca = PCA(None, copy, whiten, svd_solver, tol, iterated_power, random_state=seed)
    pca_model = pca.fit(table[input_cols])

    column_names = []
    for i in range(0, n_components):
        column_names.append(new_column_name + str(i))
    # print(column_names)

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result[:, :n_components], columns=column_names)
    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components[:n_components], columns=input_cols)
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_
    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if n_components == 1:
        sns.scatterplot(column_names[0], column_names[0], hue=hue, data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(0, 1,
                          pc_columns=column_names,
                          columns=input_cols,
                          singular_values=res_singular_values,
                          components=res_components,
                          explained_variance_ratio=res_explained_variance_ratio,
                          alpha=alpha,
                          hue=hue,
                          data=out_df,
                          ax=plt.gca(),
                          key_col=key_col)

    plt.figure()
    fig_scree = _screeplot(res_explained_variance, res_explained_variance_ratio, n_components)

    table_explained_variance = pd.DataFrame(res_explained_variance, columns=['explained_variance'])
    table_explained_variance['explained_variance_ratio'] = res_explained_variance_ratio
    table_explained_variance['cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## PCA Result
    | ### Plot
    | {image1}
    |
    | ### Explained Variance
    | {fig_scree}
    | {table_explained_variance}
    |
    | ### Components
    | {table2}
    |
    | ### Parameters
    | {parameter1}
    """.format(image1=plt_two,
               fig_scree=fig_scree,
               table_explained_variance=pandasDF2MD(table_explained_variance),
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df))))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['_repr_brtc_'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    return {'out_table': out_df, 'model': model}
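# Usage sketch (not part of the original source): project three random features
# onto two principal components. Assumes the seaborn version this module pins
# (positional x/y in sns.scatterplot, used by _biplot); the '_example_*' wrapper
# and the data are hypothetical.
def _example_pca():
    import numpy as np
    import pandas as pd
    rng = np.random.RandomState(0)
    df = pd.DataFrame(rng.rand(30, 3), columns=['a', 'b', 'c'])
    res = _pca(df, input_cols=['a', 'b', 'c'], n_components=2)
    return res['out_table'][['projected_0', 'projected_1']]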
def _biplot(xidx, yidx, data, pc_columns, columns, singular_values, components,
            explained_variance_ratio, alpha=1, ax=None, hue=None, key_col=None):
    if ax is None:
        ax = plt.gca()
    xs = data[pc_columns[xidx]] * singular_values[xidx] ** alpha
    ys = data[pc_columns[yidx]] * singular_values[yidx] ** alpha

    if key_col is not None and hue is not None:
        groups = data[hue].unique()
        k = len(data[hue].unique())
        colors = cm.viridis(np.arange(k).astype(float) / k)
        for j, color in zip(range(k), colors):
            group_data = data[data[hue] == groups[j]]
            for idx in group_data.index:
                ax.text(xs[idx], ys[idx], data[key_col][idx],
                        color=color, va='center', ha='center')
        ax.legend([Patch(color=colors[i]) for i, _ in enumerate(groups)], groups.tolist())
    elif key_col is not None and hue is None:
        for i in range(data.shape[0]):
            ax.text(xs[i], ys[i], data[key_col][i],
                    color='black', va='center', ha='center')
    elif hue is not None:
        sns.scatterplot(xs, ys, hue=data[hue], data=data, ax=ax)
    else:
        sns.scatterplot(xs, ys, data=data, ax=ax)
    ax.set_xlabel('%s (%0.4f)' % (pc_columns[xidx], explained_variance_ratio[xidx]))
    ax.set_ylabel('%s (%0.4f)' % (pc_columns[yidx], explained_variance_ratio[yidx]))

    # loading vectors for each original column
    axs = components[xidx] * singular_values[xidx] ** (1 - alpha)
    ays = components[yidx] * singular_values[yidx] ** (1 - alpha)
    xmax = np.amax(np.concatenate((xs, axs * 1.5)))
    xmin = np.amin(np.concatenate((xs, axs * 1.5)))
    ymax = np.amax(np.concatenate((ys, ays * 1.5)))
    ymin = np.amin(np.concatenate((ys, ays * 1.5)))

    for i, col in enumerate(columns):
        x, y = axs[i], ays[i]
        ax.arrow(0, 0, x, y, color='r', width=0.001, head_width=0.05)
        ax.text(x * 1.3, y * 1.3, col, color='r', ha='center', va='center')

    m = 1.2
    ax.set_xlim(xmin * m, xmax * m)
    ax.set_ylim(ymin * m, ymax * m)
    # plt.title('PCA result with two components')
    # plt.show()
    plt_two = plt2MD(plt)
    plt.clf()
    return plt_two
def _plot_binary(label, probability, threshold=None, fig_size=(6.4, 4.8), pos_label=None):
    fpr, tpr, threshold_roc = roc_curve(label, probability, pos_label=pos_label)

    # tpr vs 1-fpr
    if threshold is None:
        argmin = np.argmin(np.abs(tpr + fpr - 1))
        threshold = threshold_roc[argmin]
    else:
        # mark the ROC point closest to a user-supplied threshold
        argmin = np.argmin(np.abs(threshold_roc - threshold))
    fpr_prop = fpr[argmin]
    tpr_prop = tpr[argmin]

    plt.plot(threshold_roc, tpr, color='blue', label='TPR')
    plt.plot(threshold_roc, 1 - fpr, color='red', label='1-FPR')
    plt.xlabel('Threshold')
    plt.ylabel('TPR or 1-FPR')
    plt.legend(loc="lower center")
    plt.axvline(threshold, linestyle='--')
    plt.text(threshold + 0.02, 0.5, 'threshold: %0.2f' % threshold,
             rotation=90, verticalalignment='center')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    fig_tpr_fpr = plt2MD(plt)
    plt.clf()

    # roc
    auc_score = auc(fpr, tpr)
    plt.figure(figsize=fig_size)
    plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % auc_score)
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.plot(fpr_prop, tpr_prop, 'g*', markersize=10, color="red",
             label='threshold: %0.2f' % threshold)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    fig_roc = plt2MD(plt)
    plt.clf()

    # pr
    precision, recall, threshold_pr = precision_recall_curve(label, probability, pos_label=pos_label)
    # the PR arrays have their own length, so locate the threshold on the PR sweep
    argmin_pr = np.argmin(np.abs(threshold_pr - threshold))
    precision_prop = precision[argmin_pr]
    recall_prop = recall[argmin_pr]
    step_kwargs = ({'step': 'post'} if 'step' in signature(plt.fill_between).parameters else {})
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.plot(recall_prop, precision_prop, 'g*', markersize=10, color="red",
             label='threshold: %0.2f' % threshold)
    plt.title('Precision-Recall curve')  # TODO Average precision score
    plt.legend()
    fig_pr = plt2MD(plt)
    plt.clf()

    threshold_pr = np.append(threshold_pr, 1)
    plt.plot(threshold_pr, precision, color='blue', label='Precision')
    plt.plot(threshold_pr, recall, color='red', label='Recall')
    plt.xlabel('Threshold')
    plt.ylabel('Precision or Recall')
    plt.legend(loc="lower center")
    plt.axvline(threshold, linestyle='--')
    plt.text(threshold + 0.02, 0.5, 'threshold: %0.2f' % threshold,
             rotation=90, verticalalignment='center')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    fig_precision_recall = plt2MD(plt)
    plt.clf()

    classes = label.unique()
    neg_label = [cls for cls in classes if cls != pos_label][0]
    predict = probability.apply(lambda x: pos_label if x >= threshold else neg_label)
    _plot_confusion_matrix(label, predict, [pos_label, neg_label],
                           normalize=False, title='Confusion matrix', cmap=plt.cm.Blues)
    fig_confusion = plt2MD(plt)
    plt.clf()

    return threshold, fig_tpr_fpr, fig_roc, fig_precision_recall, fig_pr, fig_confusion
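# Usage sketch (not part of the original source): render the binary-evaluation
# figures from pandas Series of string labels and scores, letting the function
# pick a balanced threshold. The '_example_*' wrapper and the data are hypothetical.
def _example_plot_binary():
    import pandas as pd
    label = pd.Series(['p', 'p', 'n', 'n', 'p', 'n'])
    probability = pd.Series([0.9, 0.8, 0.35, 0.1, 0.6, 0.4])
    # returns (threshold, fig_tpr_fpr, fig_roc, fig_precision_recall, fig_pr, fig_confusion)
    return _plot_binary(label, probability, pos_label='p')[0]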
def _decision_tree_classification_train(table, feature_cols, label_col,  # fig_size=np.array([6.4, 4.8]),
                                        criterion='gini', splitter='best', max_depth=None,
                                        min_samples_split=2, min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.0, max_features=None,
                                        random_state=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0, min_impurity_split=None,
                                        class_weight=None, presort=False, sample_weight=None,
                                        check_input=True, X_idx_sorted=None):
    y_train = table[label_col]
    if sklearn_utils.multiclass.type_of_target(y_train) == 'continuous':
        raise_error('0718', 'label_col')

    classifier = DecisionTreeClassifier(criterion, splitter, max_depth, min_samples_split,
                                        min_samples_leaf, min_weight_fraction_leaf, max_features,
                                        random_state, max_leaf_nodes, min_impurity_decrease,
                                        min_impurity_split, class_weight, presort)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight, check_input, X_idx_sorted)

    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(classifier, out_file=dot_data,
                        feature_names=feature_cols,
                        class_names=table[label_col].astype('str').unique(),
                        filled=True, rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = ("Graphviz is needed to draw a Decision Tree graph. "
                    "Please download it from http://graphviz.org/download/ and install it to your computer.")

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), feature_importance[indices], color='b', align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v, i, " {:.2f}".format(v), color='b', va='center', fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    # Add tree plot
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _xgb_regression_train(table, feature_cols, label_col, max_depth=3, learning_rate=0.1,
                          n_estimators=100, silent=True, objective='reg:linear',
                          booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
                          max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1,
                          reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5,
                          random_state=0, seed=None, missing=None, sample_weight=None,
                          eval_set=None, eval_metric=None, early_stopping_rounds=None,
                          verbose=True, xgb_model=None, sample_weight_eval_set=None):
    validate(greater_than_or_equal_to(max_depth, 1, 'max_depth'),
             greater_than_or_equal_to(learning_rate, 0.0, 'learning_rate'),
             greater_than_or_equal_to(n_estimators, 1, 'n_estimators'))

    regressor = XGBRegressor(max_depth, learning_rate, n_estimators, silent, objective,
                             booster, n_jobs, nthread, gamma, min_child_weight, max_delta_step,
                             subsample, colsample_bytree, colsample_bylevel, reg_alpha,
                             reg_lambda, scale_pos_weight, base_score, random_state, seed,
                             missing)
    regressor.fit(table[feature_cols], table[label_col],
                  sample_weight, eval_set, eval_metric,
                  early_stopping_rounds, verbose, xgb_model,
                  sample_weight_eval_set)

    # json
    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_
    # plt.rcdefaults()
    plot_importance(regressor)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(regressor)
    # fig_plot_tree_UT = plt2MD(plt)
    # plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(regressor, rankdir='LR')
    # fig_plot_tree_LR = plt2MD(plt)
    # plt.rcdefaults()
    # plt.clf()

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance
    # out_model['plot_tree_UT'] = fig_plot_tree_UT
    # out_model['plot_tree_LR'] = fig_plot_tree_LR
    # out_model['to_graphviz'] = md_to_graphviz

    # report
    get_param_list = []
    get_param_list.append(['feature_cols', feature_cols])
    get_param_list.append(['label_col', label_col])
    for key, value in get_param.items():
        temp = [key, value]
        get_param_list.append(temp)
    get_param_df = pd.DataFrame(data=get_param_list, columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_cols).T

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## XGB Regression Result
    |
    | ### Plot Importance
    | {image_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {table_parameter}
    |
    """.format(image_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               table_parameter=pandasDF2MD(get_param_df))))
    out_model['_repr_brtc_'] = rb.get()

    return {'model': out_model}
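# Usage sketch (not part of the original source): a small XGBoost fit on synthetic
# data. Assumes the xgboost version this module pins (sklearn wrapper that still
# takes 'silent' and 'reg:linear'); the '_example_*' wrapper and the data are
# hypothetical.
def _example_xgb_regression_train():
    import numpy as np
    import pandas as pd
    rng = np.random.RandomState(0)
    df = pd.DataFrame({'x1': rng.rand(50), 'x2': rng.rand(50)})
    df['y'] = 2 * df['x1'] + 0.1 * rng.randn(50)
    res = _xgb_regression_train(df, ['x1', 'x2'], 'y', n_estimators=20)
    return res['model']['feature_importance']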
def _oneway_anova(table, response_cols, factor_col):
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    ## One-way Analysis of Variance Result
    """))

    groups = table[factor_col].unique()
    groups.sort()
    sum_len = np.sum([len(str(group)) for group in groups])

    result = dict()
    result['_grouped_data'] = dict()
    for response_col in response_cols:
        data = table[response_col]
        result['_grouped_data'][response_col] = dict()

        ax = sns.boxplot(x=factor_col, y=response_col, data=table, order=groups)
        if sum_len > 512:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        elif sum_len > 64:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
        fig_box = plt2MD(plt)
        plt.clf()

        model = ols("""Q('{response_col}') ~ C(Q('{factor_col}'))""".format(
            response_col=response_col, factor_col=factor_col), table).fit()
        # TODO factor_col = class => error
        anova = anova_lm(model)

        index_list = anova.index.tolist()
        remove_list = ["C(Q('", "'))", "Q('", "')"]
        for v in remove_list:
            index_list = [i.replace(v, "") for i in index_list]
        anova.insert(0, '', index_list)
        anova_df = pandasDF2MD(anova)
        p_value = anova["""PR(>F)"""][0]

        residual = model.resid
        sns.distplot(residual)
        distplot = plt2MD(plt)
        plt.clf()
        sm.qqplot(residual, line='s')
        qqplot = plt2MD(plt)
        plt.clf()

        rb.addMD(strip_margin("""
        | ## {response_col} by {factor_col}
        | {fig_box}
        |
        | ### ANOVA
        | {anova_df}
        |
        | ### Diagnostics
        | {distplot}
        |
        | {qqplot}
        """.format(response_col=response_col, factor_col=factor_col, fig_box=fig_box,
                   anova_df=anova_df, distplot=distplot, qqplot=qqplot)))
        result['_grouped_data'][response_col]['p_value'] = p_value

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
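# Usage sketch (not part of the original source): one response column, one factor
# with two clearly separated groups. Assumes statsmodels' ols/anova_lm are imported
# at module level; the '_example_*' wrapper and the data are hypothetical.
def _example_oneway_anova():
    import pandas as pd
    df = pd.DataFrame({'score': [1.0, 1.2, 0.9, 2.1, 2.3, 2.0],
                       'group': ['a', 'a', 'a', 'b', 'b', 'b']})
    res = _oneway_anova(df, response_cols=['score'], factor_col='group')
    return res['result']['_grouped_data']['score']['p_value']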
def _naive_bayes_train(table, feature_cols, label_col, alpha=1.0, fit_prior=True, class_prior=None):
    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        # class_prior arrives as 'label:probability' strings; map onto encoded labels
        tmp_class_prior = [0] * len(class_prior)
        for elems in class_prior:
            tmp = elems.split(":")
            tmp_class_prior[label_encoder.transform([tmp[0]])[0]] = float(tmp[1])
        class_prior = tmp_class_prior

    nb_model = MultinomialNB(alpha, fit_prior, class_prior)
    nb_model.fit(features, label_correspond)
    class_log_prior = nb_model.class_log_prior_
    feature_log_prob_ = nb_model.feature_log_prob_
    tmp_result = np.hstack((list(map(list, zip(*[label_encoder.classes_] + [class_log_prior]))),
                            feature_log_prob_))
    column_names = ['labels', 'pi']
    for feature_col in feature_cols:
        column_names += ['theta_' + feature_col]
    result_table = pd.DataFrame.from_records(tmp_result, columns=column_names)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    # get_param['Prior Probabilities of the Classes'] = class_prior
    get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)
    plt.figure()
    _plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_, title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Naive Bayes Classification Result
    |
    | ### Model: Multinomial
    | {result_table}
    | ### Parameters
    | {table_parameter}
    | ### Predicted vs Actual
    | {image1}
    | #### Accuracy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix,
               accuracy=accuracy,
               result_table=pandasDF2MD(result_table),
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
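# Usage sketch (not part of the original source): multinomial NB on non-negative
# count-like features with string labels. Assumes the module's matrix-style
# _plot_confusion_matrix helper is in scope; the '_example_*' wrapper and the data
# are hypothetical.
def _example_naive_bayes_train():
    import pandas as pd
    df = pd.DataFrame({'f1': [1, 2, 0, 3],
                       'f2': [0, 1, 2, 1],
                       'label': ['x', 'x', 'y', 'y']})
    res = _naive_bayes_train(df, feature_cols=['f1', 'f2'], label_col='label')
    return res['model']['nb_model']  # the fitted sklearn MultinomialNB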