def _random_forest_classification_train(table, feature_cols, label_col,
                                        n_estimators=10, criterion="gini",
                                        max_depth=None, min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0,
                                        max_features="sqrt",
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0,
                                        class_weight=None, random_state=None):
    """Train a scikit-learn RandomForestClassifier on `table`.

    Parameters mirror RandomForestClassifier; `feature_cols`/`label_col`
    select the training columns.  `class_weight`, when given, is a list of
    per-class weights that must match the number of distinct labels.

    Returns {'model': model} where model is a project model dict holding the
    fitted classifier, the training parameters, a feature-importance table,
    and a markdown report.
    """
    # check_col_type may expand/convert columns (e.g. array columns) into a
    # numeric matrix; feature_names are the resulting column names.
    feature_names, features_train = check_col_type(table, feature_cols)
    # X_train = table[feature_cols]
    y_train = table[label_col]
    # Classification requires a discrete target; reject continuous labels.
    if(type_of_target(y_train) == 'continuous'):
        raise_error('0718', 'label_col')
    # UI sentinel "n" means "use all features" (sklearn's max_features=None).
    if max_features == "n":
        max_features = None
    class_labels = y_train.unique()
    if class_weight is not None:
        if len(class_weight) != len(class_labels):
            raise ValueError("Number of class weights should match number of labels.")
        else:
            # Weights are supplied positionally; map them onto the sorted
            # class labels to build sklearn's {class: weight} dict.
            classes = sorted(class_labels)
            class_weight = {classes[i] : class_weight[i] for i in range(len(classes))}
    classifier = RandomForestClassifier(n_estimators=n_estimators,
                                        criterion=criterion,
                                        max_depth=max_depth,
                                        min_samples_split=min_samples_split,
                                        min_samples_leaf=min_samples_leaf,
                                        min_weight_fraction_leaf=min_weight_fraction_leaf,
                                        max_features=max_features,
                                        max_leaf_nodes=max_leaf_nodes,
                                        min_impurity_decrease=min_impurity_decrease,
                                        class_weight=class_weight,
                                        random_state=random_state)
    classifier.fit(features_train, y_train)
    params = {'feature_cols': feature_cols,
              'label_col': label_col,
              'n_estimators': n_estimators,
              'criterion': criterion,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'min_weight_fraction_leaf': min_weight_fraction_leaf,
              'max_features': max_features,
              'max_leaf_nodes': max_leaf_nodes,
              'min_impurity_decrease': min_impurity_decrease,
              'class_weight': class_weight,
              'random_state': random_state}
    model = _model_dict('random_forest_classification_model')
    model['classifier'] = classifier
    model['params'] = params
    fig_feature_importances = _plot_feature_importances(feature_names, classifier)
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Random Forest Classification Train Result
    |
    | ### Parameters
    | {params}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    """.format(params=dict2MD(params),
               fig_feature_importances=fig_feature_importances)))
    model['_repr_brtc_'] = rb.get()
    feature_importance = classifier.feature_importances_
    # NOTE(review): this table pairs importances with the ORIGINAL
    # feature_cols, while the classifier was fit on the check_col_type
    # output — if check_col_type expands columns the lengths may differ;
    # verify against check_col_type's contract.
    feature_importance_table = pd.DataFrame([[feature_cols[i], feature_importance[i]] for i in range(len(feature_cols))],
                                            columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model' : model}
def _cross_table(table, input_cols_1, input_cols_2, result='N', margins=False):
    """Build a cross-tabulation of two column groups of `table`.

    `result` selects the cell values: raw counts ('N') or counts normalized
    by row, column, or grand total.  `margins` adds an 'All' row/column.
    Returns {'model': model} containing the table and a markdown report.

    Raises a runtime error when `result` is not one of the four options.
    """
    df1 = [table[col] for col in input_cols_1]
    df2 = [table[col] for col in input_cols_2]

    # Map each result type to pandas' `normalize` argument.  The original
    # code repeated the same four-way if/elif chain twice (the second
    # chain's `else: raise` was unreachable); a single lookup removes the
    # duplication without changing which crosstab is built.
    normalize_by_result = {
        'N': None,
        'N / Row Total': 'index',
        'N / Column Total': 'columns',
        'N / Total': 'all',
    }
    if result not in normalize_by_result:
        raise_runtime_error("Please check 'result'.")
    normalize = normalize_by_result[result]
    if normalize is None:
        result_table = pd.crosstab(df1, df2, margins=margins)
    else:
        result_table = pd.crosstab(df1, df2, margins=margins, normalize=normalize)

    # Flatten the (possibly Multi-) index into display names.  With margins
    # the final entry is the ('All', '', ...) tuple, whose first element is
    # kept as-is instead of being '_'-joined.
    row_names = list(result_table.index)[:]
    if len(input_cols_1) == 1:
        joined_row_name = [str(i) for i in row_names]
    elif not margins:
        joined_row_name = ['_'.join(str(s) for s in row_names[i])
                           for i in range(len(row_names))]
    else:
        joined_row_name = ['_'.join(str(s) for s in row_names[i])
                           for i in range(len(row_names) - 1)] + [row_names[-1][0]]

    column_names = list(result_table.columns)[:]
    if len(input_cols_2) == 1:
        joined_column_name = [str(i) for i in column_names]
    elif not margins:
        joined_column_name = ['_'.join(str(s) for s in column_names[i])
                              for i in range(len(column_names))]
    else:
        joined_column_name = ['_'.join(str(s) for s in column_names[i])
                              for i in range(len(column_names) - 1)] + [column_names[-1][0]]

    # Prepend the flattened row labels as the first column; the header of
    # that column is the result type itself (as in the original per-branch
    # code, where each branch used its own `result` string).
    result_table.insert(loc=0, column=' ', value=joined_row_name)
    result_table.columns = np.append(result, joined_column_name)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Cross Table Result
    | ### Result Type : {result}
    |
    | #### Result Table
    |
    | {result_table}
    |
    """.format(result=result,
               result_table=pandasDF2MD(result_table,
                                        num_rows=len(result_table.index) + 1))))

    model = _model_dict('cross_table')
    model['result'] = result
    model['result_table'] = result_table
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _decision_tree_classification_train(
        table, feature_cols, label_col,  # fig_size=np.array([6.4, 4.8]),
        criterion='gini', splitter='best', max_depth=None, min_samples_split=2,
        min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None,
        random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0,
        min_impurity_split=None, class_weight=None, presort=False,
        sample_weight=None, check_input=True, X_idx_sorted=None):
    """Train a DecisionTreeClassifier and build a markdown report.

    Parameters mirror sklearn's DecisionTreeClassifier / fit; the
    min_samples_* and max_depth values are validated first.  Returns
    {'model': model} with the fitted classifier, its attributes, and a
    report containing a Graphviz tree rendering (when available) and a
    feature-importance bar chart.
    """
    param_validation_check = [
        greater_than_or_equal_to(min_samples_split, 2, 'min_samples_split'),
        greater_than_or_equal_to(min_samples_leaf, 1, 'min_samples_leaf'),
        greater_than_or_equal_to(min_weight_fraction_leaf, 0.0, 'min_weight_fraction_leaf')
    ]
    if max_depth is not None:
        param_validation_check.append(
            greater_than_or_equal_to(max_depth, 1, 'max_depth'))
    validate(*param_validation_check)

    # Keyword arguments instead of the original all-positional call: the
    # positional form silently depends on sklearn's parameter order and is
    # rejected outright by modern sklearn (keyword-only constructor args).
    classifier = DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        class_weight=class_weight,
        presort=presort)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight=sample_weight,
                   check_input=check_input,
                   X_idx_sorted=X_idx_sorted)

    # Tree rendering is best-effort: Graphviz/pydotplus may be missing.
    # `except Exception` (not bare `except:`) so Ctrl-C / SystemExit still
    # propagate instead of being swallowed.
    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(classifier, out_file=dot_data,
                        feature_names=feature_cols,
                        class_names=table[label_col].astype('str').unique(),
                        filled=True, rounded=True, special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report: horizontal bar chart of importances, sorted ascending so the
    # most important feature appears at the top of the chart.
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), feature_importance[indices], color='b', align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v, i, " {:.2f}".format(v), color='b', va='center', fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()
    params = dict2MD(get_param)
    # (removed unused `feature_importance_df` local)

    # Add tree plot
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _mlp_regression_train(table, feature_cols, label_col,
                          hidden_layer_sizes=(100, ), activation='relu',
                          solver='adam', alpha=0.0001, batch_size_auto=True,
                          batch_size='auto', learning_rate='constant',
                          learning_rate_init=0.001, max_iter=200,
                          random_state=None, tol=0.0001):
    """Train an MLPRegressor and report MAE / MSE / R2 on the training set.

    Parameters mirror sklearn's MLPRegressor (shuffle is fixed to True).
    Returns {'model': model} with the fitted network, its weights, the
    training-set scores, the hyper-parameters, and a markdown report.
    """
    _, features = check_col_type(table, feature_cols)
    label = table[label_col]
    mlp_model = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes,
                             activation=activation, solver=solver, alpha=alpha,
                             batch_size=batch_size, learning_rate=learning_rate,
                             learning_rate_init=learning_rate_init,
                             max_iter=max_iter, shuffle=True,
                             random_state=random_state, tol=tol)
    mlp_model.fit(features, label)
    predict = mlp_model.predict(features)

    _mean_absolute_error = mean_absolute_error(label, predict)
    _mean_squared_error = mean_squared_error(label, predict)
    _r2_score = r2_score(label, predict)

    # pd.DataFrame.from_items was deprecated in pandas 0.23 and removed in
    # 1.0; a plain dict literal preserves column order on Python 3.7+.
    result_table = pd.DataFrame({
        'Metric': ['Mean Absolute Error', 'Mean Squared Error', 'R2 Score'],
        'Score': [_mean_absolute_error, _mean_squared_error, _r2_score]})

    # Display names for the reported hyper-parameters, keyed by the
    # get_params() name.
    label_name = {
        'hidden_layer_sizes': 'Hidden Layer Sizes',
        'activation': 'Activation Function',
        'solver': 'Solver',
        'alpha': 'Alpha',
        'batch_size': 'Batch Size',
        'learning_rate': 'Learning Rate',
        'learning_rate_init': 'Learning Rate Initial',
        'max_iter': 'Max Iteration',
        'random_state': 'Seed',
        'tol': 'Tolerance'
    }
    get_param = mlp_model.get_params()
    param_table = pd.DataFrame({
        'Parameter': list(label_name.values()),
        'Value': [get_param[x] for x in list(label_name.keys())]})

    rb = BrtcReprBuilder()
    # Header fixed: this is the regression trainer, but the original report
    # said "MLP Classification Result".
    rb.addMD(strip_margin("""
    | ### MLP Regression Result
    | {result}
    | ### Parameters
    | {list_parameters}
    """.format(result=pandasDF2MD(result_table),
               list_parameters=pandasDF2MD(param_table))))

    model = _model_dict('mlp_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercepts'] = mlp_model.intercepts_
    model['coefficients'] = mlp_model.coefs_
    model['loss'] = mlp_model.loss_
    model['mean_absolute_error'] = _mean_absolute_error
    model['mean_squared_error'] = _mean_squared_error
    model['r2_score'] = _r2_score
    model['activation'] = activation
    model['solver'] = solver
    model['alpha'] = alpha
    model['batch_size'] = batch_size
    model['learning_rate'] = learning_rate
    model['learning_rate_init'] = learning_rate_init
    model['max_iter'] = max_iter
    model['random_state'] = random_state
    model['tol'] = tol
    model['mlp_model'] = mlp_model
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _linear_regression_train(table, feature_cols, label_col,
                             fit_intercept=True, is_vif=False,
                             vif_threshold=10):
    """Fit an OLS linear regression (statsmodels) with diagnostic plots.

    When `fit_intercept` is True a constant column is added to the
    features.  `is_vif` appends variance-inflation-factor columns to the
    coefficient summary table.  Returns {'model': model} with coefficients,
    fit statistics, the statsmodels summary tables, and an HTML/markdown
    report (actual-vs-predicted plus three residual diagnostics).
    """
    features = table[feature_cols]
    label = table[label_col]
    # NOTE(review): both branches fit the same OLS; only the add_constant
    # call differs, so the else-branch duplicates the fit line.
    if fit_intercept == True:
        features = sm.add_constant(features, has_constant='add')
        lr_model_fit = sm.OLS(label, features).fit()
    else:
        lr_model_fit = sm.OLS(label, features).fit()
    predict = lr_model_fit.predict(features)
    residual = label - predict

    summary = lr_model_fit.summary()
    # Convert the statsmodels SimpleTable triple into DataFrames.
    summary_tables = simple_tables2df_list(summary.tables, drop_index=True)
    summary0 = summary_tables[0]
    summary1 = summary_tables[1]
    if is_vif:
        # VIF per feature column (includes the constant when fit_intercept);
        # the augmented table is written back into the summary so it shows
        # up in the HTML report as well.
        summary1['VIF'] = [
            variance_inflation_factor(features.values, i)
            for i in range(features.shape[1])
        ]
        summary1['VIF>{}'.format(vif_threshold)] = summary1['VIF'].apply(
            lambda _: 'true' if _ > vif_threshold else 'false')
        summary.tables[1] = _df_to_simpletable(summary1)
    summary2 = summary_tables[2]
    html_result = summary.as_html()

    # Actual vs predicted scatter with the y = x reference line.
    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)

    # Residuals vs predicted.
    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)

    # Normal Q-Q plot of residuals.
    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)

    # Residual distribution.
    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3)))

    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['fit_intercept'] = fit_intercept
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['_repr_brtc_'] = rb.get()
    model['summary0'] = summary0
    model['summary1'] = summary1
    model['summary2'] = summary2
    # Drop the training data from the results object to shrink the stored
    # model before serialization.
    lr_model_fit.remove_data()
    model['lr_model'] = lr_model_fit
    return {'model': model}
def _autocorrelation(table, input_col, nlags=20, conf_level=0.95):
    """Compute ACF and PACF for a column, with plots and lag tables.

    Produces statsmodels ACF/PACF plots plus per-lag tables that include
    the confidence interval at `conf_level`.  Returns {'model': model}
    holding both tables and the markdown report.
    """
    series = table[input_col]
    alpha = 1 - conf_level

    # Autocorrelation plot.
    plt.figure()
    plot_acf(series, lags=nlags, alpha=alpha)
    fig_acf = plt2MD(plt)
    plt.clf()

    # Partial autocorrelation plot.
    plt.figure()
    plot_pacf(series, lags=nlags, alpha=alpha)
    fig_pacf = plt2MD(plt)
    plt.clf()

    # With alpha set, acf/pacf return (values, conf_intervals).
    acf_out = acf(series, nlags=nlags, alpha=alpha)
    pacf_out = pacf(series, nlags=nlags, alpha=alpha)

    lag_range = list(range(nlags + 1))

    result_table1 = pd.DataFrame([])
    result_table1['lag'] = lag_range
    result_table1['ACF'] = acf_out[0]
    if conf_level is not None:
        result_table1['%g%% confidence Interval' % (conf_level * 100)] = [
            str((acf_out[1][i][0], acf_out[1][i][1])) for i in lag_range
        ]

    result_table2 = pd.DataFrame([])
    result_table2['lag'] = lag_range
    result_table2['PACF'] = pacf_out[0]
    if conf_level is not None:
        result_table2['%g%% confidence Interval' % (conf_level * 100)] = [
            str((pacf_out[1][i][0], pacf_out[1][i][1])) for i in lag_range
        ]

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Autocorrelation / Partial Autocorrelation Result"""))
    rb.addMD(strip_margin("""
    |## Autocorrelation
    |
    |{image1}
    |
    |### Autocorrelation Table
    |
    |{result_table1}
    |
    |## Partial Autocorrelation
    |
    |{image2}
    |
    |### Partial Autocorrelation Table
    |
    |{result_table2}
    |
    """.format(image1=fig_acf,
               result_table1=pandasDF2MD(result_table1, num_rows=nlags + 1),
               image2=fig_pacf,
               result_table2=pandasDF2MD(result_table2, num_rows=nlags + 1))))

    model = _model_dict('autocorrelation')
    model['autocorrelation_table'] = result_table1
    model['partial_autocorrelation_table'] = result_table2
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _one_hot_encoder(table, input_cols, prefix='list', prefix_list=None,
                     suffix='index', n_values='auto',
                     categorical_features='all', sparse=True,
                     handle_unknown='error', drop_last=False):
    """One-hot encode each column of `input_cols`, appending the new columns.

    New column names are `<prefix>_<suffix>` where the prefix is either the
    source column name or the matching entry of `prefix_list` (when
    prefix == 'list'), and the suffix is either the category index
    (suffix == 'index') or the category value.  `drop_last` omits the last
    dummy column from the output table (the fitted encoders still see all
    categories).

    Returns {'out_table': ..., 'model': ...}; the model keeps the fitted
    encoders so the same mapping can be applied to new data.

    NOTE(review): relies on the legacy sklearn OneHotEncoder API
    (`n_values`, `categorical_features`, `active_features_`,
    `feature_indices_`, `n_values_`), removed in sklearn >= 0.22 — confirm
    the pinned sklearn version.
    """
    out_table = table.copy()
    # The `sparse` parameter is deliberately overridden: downstream code
    # assembles a dense DataFrame from the transform output.
    sparse = False
    enc_list = []
    le_list = []
    if drop_last:
        new_col_names_list_with_true_drop_last = []
    new_col_names_list = []
    prefix_list_index = 0
    if prefix == 'list':
        # User-supplied prefixes must pair 1:1 with the input columns.
        len_prefix_list = 0 if prefix_list is None else len(prefix_list)
        if len(input_cols) != len_prefix_list:
            # TODO: make the error message code
            raise_runtime_error(
                'The number of Input Columns and the number of Prefixes should be equal.'
            )
    for col_name in input_cols:
        enc = OneHotEncoder(n_values=n_values,
                            categorical_features=categorical_features,
                            sparse=sparse,
                            handle_unknown=handle_unknown)
        # LabelEncoder maps the raw category values to 0..k-1 before
        # one-hot encoding.
        le = LabelEncoder()
        new_col_names = []
        if suffix == 'index':
            # Suffix with the category's positional index.
            if prefix == 'list':
                for i in range(0, len(np.unique(out_table[col_name].values))):
                    new_col_names.append(prefix_list[prefix_list_index] + '_' + str(i))
            else:
                for i in range(0, len(np.unique(out_table[col_name].values))):
                    new_col_names.append(col_name + '_' + str(i))
        else:
            # Suffix with the category value itself (np.unique is sorted,
            # matching LabelEncoder's class order).
            if prefix == 'list':
                for i in np.unique(out_table[col_name].values):
                    new_col_names.append(prefix_list[prefix_list_index] + '_' + str(i))
            else:
                for i in np.unique(out_table[col_name].values):
                    new_col_names.append(col_name + '_' + str(i))
        transformed_table = pd.DataFrame(enc.fit_transform(
            le.fit_transform(out_table[col_name]).reshape(-1, 1)),
            columns=new_col_names)
        new_col_names_list.append(new_col_names)
        if drop_last:
            # Drop the final dummy column to avoid perfect collinearity.
            new_col_names = new_col_names[:-1]
            new_col_names_list_with_true_drop_last.append(new_col_names)
        for new_col_name in new_col_names:
            out_table[new_col_name] = transformed_table[new_col_name]
        enc_list.append(enc)
        le_list.append(le)
        prefix_list_index = prefix_list_index + 1
    out_model = _model_dict('one_hot_encoder')
    out_model['one_hot_encoder_list'] = enc_list
    out_model['label_encoder_list'] = le_list
    out_model['input_cols'] = input_cols
    # NOTE(review): `le`/`enc` here are whatever the LAST loop iteration
    # left behind, so these four entries describe only the last input
    # column (the per-column encoders are in the *_list entries above).
    out_model['classes'] = le.classes_
    out_model['active_features'] = enc.active_features_
    out_model['feature_indices'] = enc.feature_indices_
    out_model['n_values'] = enc.n_values_
    out_model['prefix'] = prefix
    out_model['prefix_list'] = prefix_list
    out_model['suffix'] = suffix
    out_model['drop_last'] = drop_last
    if drop_last:
        out_model[
            'new_col_names_list_with_true_drop_last'] = new_col_names_list_with_true_drop_last
    out_model['new_col_names_list'] = new_col_names_list
    return {'out_table': out_table, 'model': out_model}
def _chi_square_test_of_independence(table, response_cols, factor_col,
                                     correction=False):
    """Chi-square test of independence between `factor_col` and each
    response column.

    For every response column a contingency table is built and
    scipy.stats.chi2_contingency is run (with the given Yates `correction`
    and lambda_=1, i.e. the default Pearson statistic family).  The
    conclusion is stated at the 5% significance level.

    Returns {'model': model} where model['report'] holds the markdown
    report (one section per response column).
    """
    label_list = []
    feature_list = []
    alternative_hypothesis_list = []
    dof_list = []
    stat_chi_list = []
    p_chi_list = []

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Chi-square Test of Independence Result
    | - H0: the two categorical variables are independent.
    | - H1: the two categorical variables are dependent.
    """))

    # Single pass per response column: the original ran this identical
    # computation twice (once for the summary table, once for the report).
    for response_col in response_cols:
        # Strip the 'All' margins row/column before testing.
        contingency_table = pd.crosstab(table[response_col],
                                        table[factor_col], margins=True)
        response_index = len(contingency_table) - 1
        factor_index = len(contingency_table.columns) - 1
        temporary = contingency_table.iloc[0:response_index, 0:factor_index]
        f_object = np.array(temporary)
        # (statistic, p-value, dof) — the expected-frequency table is dropped.
        test = stats.chi2_contingency(f_object, correction, 1)[0:3]

        label = '{factor_col}'.format(factor_col=factor_col)
        feature = '{response_col}'.format(response_col=response_col)
        if test[1] < 0.05:
            dependence = 'Reject the null hypothesis that two categorical variables are independent at 5% significance level.'
        elif test[1] >= 0.05:
            dependence = 'No association was found between two categorical variables at 5% significance level.'
        elif math.isnan(test[1]):
            # NaN p-value fails both comparisons above and lands here.
            dependence = 'Independence of two categorical variables cannot be decided.'

        dof = 'chi-square distribution with {dof} degrees of freedom'.format(
            dof=test[2])
        stat_chi = '{stat_chi}'.format(stat_chi=test[0])
        p_chi = '{p_chi}'.format(p_chi=test[1])

        label_list.append(label)
        feature_list.append(feature)
        alternative_hypothesis_list.append('Two categorical variables are dependent.')
        dof_list.append(dof)
        stat_chi_list.append(stat_chi)
        p_chi_list.append(p_chi)

        # Per-column report section.  pd.DataFrame.from_items was removed
        # in pandas 1.0; a dict literal preserves the column order.
        result_table_simple = pd.DataFrame({
            'estimate': [stat_chi],
            'df': ['{dof}'.format(dof=test[2])],
            'p_value': [p_chi]})
        rb.addMD(strip_margin("""
        |### Label: {label}, Feature: {feature}
        |
        |{result_table_simple}
        |
        |{dependence}
        |
        |
        """.format(label=factor_col, feature=response_col,
                   result_table_simple=pandasDF2MD(result_table_simple),
                   dependence=dependence)))

    # Summary table across all response columns.  NOTE: as in the original,
    # this table is built but not part of the returned dict; only the
    # report is exposed.  (The original stashed it in a dead local dict.)
    result_table = pd.DataFrame({
        'label': label_list,
        'feature': feature_list,
        'alternative_hypothesis': alternative_hypothesis_list,
        'df': dof_list,
        'estimate': stat_chi_list,
        'p_value': p_chi_list})

    model = _model_dict('Chi-square test of independence')
    model['report'] = rb.get()
    return {'model': model}
def _association_rule_visualization(table, option='multiple_to_single',
                                    edge_length_scaling=1, font_size=10,
                                    node_size_scaling=1,
                                    figure_size_muliplier=1,
                                    display_rule_num=False):
    """Draw association rules from `table` as a networkx digraph.

    `table` is expected to hold association-rule rows with list-valued
    'antecedent'/'consequent' columns plus 'support', 'confidence', and
    'lift'.  Three layouts:
      - 'single_to_single': only 1-item -> 1-item rules, items as nodes.
      - 'multiple_to_single': rules with a single consequent; each rule gets
        an intermediate rule-node between its antecedents and consequent.
      - anything else: all rules, rule-nodes between antecedents and each
        consequent item.
    Returns {'model': model} with the rendered figure in the report.
    """
    if (option == 'single_to_single'):
        result_network = table.copy()

        length_ante = []
        string_ante = []
        length_conse = []
        string_conse = []
        # Record item-set sizes and first items so 1->1 rules can be kept.
        for row in result_network['antecedent']:
            length_ante += [len(row)]
            string_ante += [row[0]]
        for row in result_network['consequent']:
            length_conse += [len(row)]
            string_conse += [row[0]]
        result_network['length_ante'] = length_ante
        result_network['string_ante'] = string_ante
        result_network['length_conse'] = length_conse
        result_network['string_conse'] = string_conse
        result_network = result_network[result_network.length_ante == 1]
        result_network = result_network[result_network.length_conse == 1]
        # Item supports recovered from the rule metrics:
        # support(A) = support(rule)/confidence, support(B) = confidence/lift.
        result_network['support_ante'] = result_network[
            'support'] / result_network['confidence']
        result_network['support_conse'] = result_network[
            'confidence'] / result_network['lift']
        #edges_colors = preprocessing.LabelEncoder()
        #edges_colors.fit(result_network['lift'])
        #edges_colors = edges_colors.transform(result_network['lift'])
        #result_network['edge_colors'] = edges_colors
        result_network = result_network.reset_index()

        # One directed edge per rule: antecedent item -> consequent item.
        edges = []
        for i in range(len(result_network.string_ante)):
            edges += [(result_network.string_ante[i],
                       result_network.string_conse[i])]
        G = nx.DiGraph()
        G.add_edges_from(edges)
        nodes = G.nodes()
        plt.figure(figsize=(4 * len(nodes) ** 0.5 * figure_size_muliplier,
                            4 * len(nodes) ** 0.5 * figure_size_muliplier))
        pos = nx.spring_layout(G, k=0.4 * edge_length_scaling)

        # Per-node support lookup (first occurrence wins via drop_duplicates).
        node_tmp = list(result_network.string_ante) + list(
            result_network.string_conse)
        support_tmp = list(result_network.support_ante) + list(
            result_network.support_conse)
        tmp_node_support = []
        for i in range(len(node_tmp)):
            tmp_node_support += [[node_tmp[i], support_tmp[i]]]
        nodes_table = pd.DataFrame.from_records(tmp_node_support,
                                                columns=['name', 'support'])
        nodes_table = nodes_table.drop_duplicates(['name'])
        node_color = []
        nodes_table = nodes_table.reset_index()
        scaled_support = _scaling(nodes_table.support)
        # Node size/color proportional to (scaled) item support, in G's
        # node iteration order.
        for node in nodes:
            for i in range(len(nodes_table.name)):
                if nodes_table.name[i] == node:
                    node_color += [
                        scaled_support[i] * 2500 * node_size_scaling
                    ]
                    break
        #if(scaling==True):
        #    edge_color = [result_network['edge_colors'][n] for n in range(len(result_network['length_conse']))]
        #else:
        # Edge width from confidence, edge color from lift.
        scaled_support = _scaling(result_network['confidence'])
        edge_size = [
            scaled_support[n] * 8
            for n in range(len(result_network['length_conse']))
        ]
        edge_color = [
            result_network['lift'][n]
            for n in range(len(result_network['length_conse']))
        ]
        nx.draw(G, pos, node_color=node_color, edge_color=edge_color,
                node_size=node_color,
                arrowsize=20 * (0.2 + 0.8 * node_size_scaling),
                font_family='NanumGothic', with_labels=True,
                cmap=plt.cm.Blues, edge_cmap=plt.cm.Reds, arrows=True,
                edge_size=edge_color, width=edge_size, font_size=font_size)
        fig_digraph = plt2MD(plt)

        graph_min_support = np.min(nodes_table.support)
        graph_max_support = np.max(nodes_table.support)
        graph_min_confidence = np.min(result_network['confidence'])
        graph_max_confidence = np.max(result_network['confidence'])
        graph_min_lift = np.min(result_network['lift'])
        graph_max_lift = np.max(result_network['lift'])
        rb = BrtcReprBuilder()
        rb.addMD(strip_margin("""
        | ### Network Digraph
        | ##### Node color, size : support ({graph_min_support}~{graph_max_support})
        | ##### Edge color : lift ({graph_min_lift}~{graph_max_lift})
        | ##### Edge size : confidence ({graph_min_confidence}~{graph_max_confidence})
        | {image1}
        |
        """.format(image1=fig_digraph,
                   graph_min_support=graph_min_support,
                   graph_max_support=graph_max_support,
                   graph_min_lift=graph_min_lift,
                   graph_max_lift=graph_max_lift,
                   graph_min_confidence=graph_min_confidence,
                   graph_max_confidence=graph_max_confidence)))
    elif (option == 'multiple_to_single'):
        result_network = table.copy()
        length_ante = []
        string_ante = []
        length_conse = []
        string_conse = []
        for row in result_network['consequent']:
            length_conse += [len(row)]
            string_conse += [row[0]]
        result_network['length_conse'] = length_conse
        result_network['consequent'] = string_conse
        # Keep only rules with exactly one consequent item.
        result_network = result_network[result_network.length_conse == 1]
        index_list = result_network.index.tolist()

        # One synthetic "rule node" per rule, labelled R1.. or an invisible
        # run of blanks when rule numbers are hidden.
        rownum = []
        for i in range(len(result_network['consequent'])):
            if display_rule_num:
                rownum += ['R%d' % (i + 1)]
            else:
                rownum += [_n_blank_strings(i + 1)]
        result_network['row_number'] = rownum

        # antecedents -> rule node -> consequent.
        edges = []
        nodes = []
        for i in index_list:
            for j in range(len(result_network.antecedent[i])):
                edges += [(result_network.antecedent[i][j],
                           result_network['row_number'][i])]
            edges += [(result_network['row_number'][i],
                       result_network.consequent[i])]
            nodes += [result_network['row_number'][i]]
        G = nx.DiGraph()
        G.add_nodes_from(nodes)
        G.add_edges_from(edges)
        plt.figure(figsize=(2 * len(nodes) ** 0.5 * figure_size_muliplier,
                            2 * len(nodes) ** 0.5 * figure_size_muliplier))
        pos = nx.spring_layout(G, k=0.2 * edge_length_scaling)

        # Rule nodes (added first, so first in node order) are colored by
        # support and sized by scaled lift; item nodes stay size/color 0.
        nodes_color = []
        nodes_size = []
        scaled_lift = _scaling(result_network.lift)
        for node in range(len(G.nodes())):
            if node < len(nodes):
                nodes_color += [result_network.support[index_list[node]]]
                nodes_size += [scaled_lift[node] * 2000 * node_size_scaling]
            else:
                nodes_color += [0]
                nodes_size += [0]
        nx.draw(G, pos, node_color=nodes_color, node_size=nodes_size,
                font_family='NanumGothic', with_labels=True,
                cmap=plt.cm.Reds, arrows=True, edge_color='Grey',
                font_weight='bold',
                arrowsize=20 * (0.2 + 0.8 * node_size_scaling),
                font_size=font_size)
        fig_digraph = plt2MD(plt)

        graph_min_support = np.min(result_network.support)
        graph_max_support = np.max(result_network.support)
        graph_min_lift = np.min(result_network.lift)
        graph_max_lift = np.max(result_network.lift)
        rb = BrtcReprBuilder()
        rb.addMD(strip_margin("""
        | ### Network Digraph
        | ##### Size of circle : support ({graph_min_support}~{graph_max_support})
        | ##### Color of circle : lift ({graph_min_lift}~{graph_max_lift})
        | {image1}
        |
        """.format(image1=fig_digraph,
                   graph_min_support=graph_min_support,
                   graph_max_support=graph_max_support,
                   graph_min_lift=graph_min_lift,
                   graph_max_lift=graph_max_lift)))
    else:
        # multiple-to-multiple: every rule kept; rule node fans out to each
        # consequent item.
        result_network = table.copy()
        length_ante = []
        string_ante = []
        length_conse = []
        string_conse = []
        for row in result_network['consequent']:
            length_conse += [len(row)]
        result_network['length_conse'] = length_conse
        result_network = result_network.reset_index()
        rownum = []
        for i in range(len(result_network['consequent'])):
            if display_rule_num:
                # NOTE(review): numbering starts at R0 here but R1 in the
                # multiple_to_single branch — confirm if intentional.
                rownum += ['R%d' % i]
            else:
                rownum += [_n_blank_strings(i + 1)]
        result_network['row_number'] = rownum

        edges = []
        nodes = []
        for i in range(len(result_network.consequent)):
            for j in range(len(result_network.antecedent[i])):
                edges += [(result_network.antecedent[i][j],
                           result_network['row_number'][i])]
            for j in range(len(result_network.consequent[i])):
                edges += [(result_network['row_number'][i],
                           result_network.consequent[i][j])]
            nodes += [result_network['row_number'][i]]
        G = nx.DiGraph()
        G.add_nodes_from(nodes)
        G.add_edges_from(edges)
        plt.figure(figsize=(2 * len(nodes) ** 0.5 * figure_size_muliplier,
                            2 * len(nodes) ** 0.5 * figure_size_muliplier))
        pos = nx.spring_layout(G, k=0.2 * edge_length_scaling)
        nodes_color = []
        nodes_size = []
        scaled_lift = _scaling(result_network.lift)
        for node in range(len(G.nodes())):
            if node < len(nodes):
                nodes_color += [result_network.support[node]]
                nodes_size += [scaled_lift[node] * 2000 * node_size_scaling]
            else:
                nodes_color += [0]
                nodes_size += [0]
        nx.draw(G, pos, node_color=nodes_color, node_size=nodes_size,
                font_family='NanumGothic', with_labels=True,
                cmap=plt.cm.Reds, arrows=True, edge_color='Grey',
                font_weight='bold',
                arrowsize=20 * (0.2 + 0.8 * node_size_scaling),
                font_size=font_size)
        fig_digraph = plt2MD(plt)

        graph_min_support = np.min(result_network.support)
        graph_max_support = np.max(result_network.support)
        graph_min_lift = np.min(result_network.lift)
        graph_max_lift = np.max(result_network.lift)
        rb = BrtcReprBuilder()
        rb.addMD(strip_margin("""
        | ### Network Digraph
        | ##### Size of circle : support ({graph_min_support}~{graph_max_support})
        | ##### Color of circle : lift ({graph_min_lift}~{graph_max_lift})
        | {image1}
        |
        """.format(image1=fig_digraph,
                   graph_min_support=graph_min_support,
                   graph_max_support=graph_max_support,
                   graph_min_lift=graph_min_lift,
                   graph_max_lift=graph_max_lift)))
    model = _model_dict('Association rule')
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def naive_bayes_train(table, feature_cols, label_col, alpha=1.0,
                      fit_prior=True, class_prior=None):
    """Train a MultinomialNB classifier on `table`.

    Labels are integer-encoded before fitting.  `class_prior`, when given,
    is a list of "label:weight" strings that is converted into a prior
    vector ordered by the encoded classes.  Returns {'model': model} with
    the fitted model, the label encoder, and a markdown report.
    """
    features = table[feature_cols]
    label = table[label_col]

    # Encode string labels as 0..k-1 for sklearn.
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    encoded_label = label_encoder.transform(label)

    # Translate "label:weight" entries into a prior list indexed by the
    # encoded class position.
    if class_prior is not None:
        priors = [0 for _ in range(len(class_prior))]
        for entry in class_prior:
            parts = entry.split(":")
            priors[label_encoder.transform([parts[0]])[0]] = float(parts[1])
        class_prior = priors

    nb_model = MultinomialNB(alpha, fit_prior, class_prior)
    nb_model.fit(features, encoded_label)
    predictions = nb_model.predict(features)

    report_params = dict()
    report_params['Lambda'] = alpha
    report_params['Feature Columns'] = feature_cols
    report_params['Label Column'] = label_col

    # Training-set confusion matrix for the report.
    cnf_matrix = confusion_matrix(encoded_label, predictions)
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_,
                          title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)

    accuracy = nb_model.score(features, encoded_label) * 100

    rb = ReportBuilder()
    # ("Accuacy" typo kept: the report text is part of the output contract.)
    rb.addMD(strip_margin("""
    | ## Naive Bayes Classification Result
    | ### Parameters
    | {table_parameter}
    | ### Predicted vs Actual
    | {image1}
    | #### Accuacy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix, accuracy=accuracy,
               table_parameter=dict2MD(report_params))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['report'] = rb.get()
    return {'model': model}
def _mlp_classification_train(table, feature_cols, label_col, hidden_layer_sizes=(100, ), activation='relu', solver='adam', alpha=0.0001, batch_size_auto=True, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, max_iter=200, random_state=None, tol=0.0001):
    """Train an MLP classifier on the training set and report training metrics.

    :param table: input DataFrame.
    :param feature_cols: list of feature column names.
    :param label_col: name of the (discrete) label column.
    :param batch_size_auto: UI flag; the effective value is `batch_size`.
    :return: {'model': model_dict} with the fitted model, metrics and report.
    :raises: error '0718' when the label column is continuous.
    """
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]
    # A continuous target cannot be classified.
    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    mlp_model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,
                              activation=activation, solver=solver,
                              alpha=alpha, batch_size=batch_size,
                              learning_rate=learning_rate,
                              learning_rate_init=learning_rate_init,
                              max_iter=max_iter, shuffle=True,
                              random_state=random_state, tol=tol)
    mlp_model.fit(features, label)
    predict = mlp_model.predict(features)

    # Training-set metrics; micro-averaging handles the multi-class case.
    _accuracy_score = accuracy_score(label, predict)
    _f1_score = f1_score(label, predict, average='micro')
    _precision_score = precision_score(label, predict, average='micro')
    _recall_score = recall_score(label, predict, average='micro')

    # BUG FIX: pd.DataFrame.from_items was removed in pandas 1.0. A plain
    # dict preserves insertion order (Python 3.7+) and is portable.
    result_table = pd.DataFrame({
        'Metric': ['Accuracy Score', 'F1 Score', 'Precision Score', 'Recall Score'],
        'Score': [_accuracy_score, _f1_score, _precision_score, _recall_score]
    })

    # Human-readable names for the reported hyper-parameters.
    label_name = {
        'hidden_layer_sizes': 'Hidden Layer Sizes',
        'activation': 'Activation Function',
        'solver': 'Solver',
        'alpha': 'Alpha',
        'batch_size': 'Batch Size',
        'learning_rate': 'Learning Rate',
        'learning_rate_init': 'Learning Rate Initial',
        'max_iter': 'Max Iteration',
        'random_state': 'Seed',
        'tol': 'Tolerance'}
    get_param = mlp_model.get_params()
    param_table = pd.DataFrame({
        'Parameter': list(label_name.values()),
        'Value': [get_param[x] for x in label_name.keys()]
    })

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ### MLP Classification Result
    | {result}
    | ### Parameters
    | {list_parameters}
    """.format(result=pandasDF2MD(result_table), list_parameters=pandasDF2MD(param_table))))

    model = _model_dict('mlp_classification_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercepts'] = mlp_model.intercepts_
    model['coefficients'] = mlp_model.coefs_
    model['class'] = mlp_model.classes_
    model['loss'] = mlp_model.loss_
    model['accuracy_score'] = _accuracy_score
    model['f1_score'] = _f1_score
    model['precision_score'] = _precision_score
    model['recall_score'] = _recall_score
    model['activation'] = activation
    model['solver'] = solver
    model['alpha'] = alpha
    model['batch_size'] = batch_size
    model['learning_rate'] = learning_rate
    model['learning_rate_init'] = learning_rate_init
    model['max_iter'] = max_iter
    model['random_state'] = random_state
    model['tol'] = tol
    model['mlp_model'] = mlp_model
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _lda(table, input_col, num_voca=1000, num_topic=3, num_topic_word=3, max_iter=20, learning_method='online', learning_offset=10., random_state=None):
    """Fit an LDA topic model on a text column and report the topics.

    :param table: input DataFrame.
    :param input_col: name of the document (text) column.
    :param num_voca: maximum vocabulary size for the count vectorizer.
    :param num_topic: number of topics to extract.
    :param num_topic_word: number of top terms reported per topic.
    :return: {'model': model_dict} with topics and per-document classification.
    """
    corpus = table[input_col]

    # Cap vocabulary at num_voca terms; drop terms in >95% of documents or
    # fewer than 2 documents, plus English stop words.
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_voca, stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic, max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic, max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")

    # For every topic keep the num_topic_word strongest vocabulary terms.
    topic_idx_list = []
    voca_weights_list = []
    for topic_idx, weights in enumerate(lda_model.components_):
        topic_idx_list.append("Topic {}".format(topic_idx))
        ranked = sorted(
            ((abs(value), tf_feature_names[term_idx]) for term_idx, value in enumerate(weights)),
            key=lambda pair: pair[0], reverse=True)
        voca_weights_list.append(
            ["{}: {}".format(term, weight) for weight, term in ranked[:num_topic_word]])

    topic_model = pd.DataFrame([])
    topic_model['topic idx'] = topic_idx_list
    topic_model['topic vocabularies'] = voca_weights_list

    # Assign every document its most probable topic.
    doc_topic = lda_model.transform(term_count)
    doc_classification = pd.DataFrame()
    doc_classification['documents'] = [doc for doc in corpus]
    doc_classification['top topic'] = [
        "Topic {}".format(doc_topic[i].argmax()) for i in range(len(corpus))
    ]

    params = {
        'Input Column': input_col,
        'Number of Vocabularies': num_voca,
        'Number of Topics': num_topic,
        'Number of Terminologies': num_topic_word,
        'Iterations': max_iter,
        'Learning Method': learning_method,
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Latent Dirichlet Allocation Result"""))
    rb.addMD(strip_margin("""
    |
    |### Parameters
    |
    | {display_params}
    |
    |### Topic Model
    |
    |{topic_model}
    |
    |### Documents Classification
    |
    |{doc_classification}
    |
    """.format(display_params=dict2MD(params),
               topic_model=pandasDF2MD(topic_model, num_rows=num_topic + 1),
               doc_classification=pandasDF2MD(doc_classification, num_rows=len(corpus) + 1))))

    model = _model_dict('lda')
    model['parameter'] = params
    model['topic_model'] = topic_model
    model['documents_classification'] = doc_classification
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _xgb_regression_train(table, feature_cols, label_col, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objectibe='reg:linear', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None, sample_weight_eval_set=None):
    """Train an XGBoost regressor and build a report with feature importances.

    :param table: input DataFrame.
    :param feature_cols: list of feature column names.
    :param label_col: name of the label column.
    :param objectibe: XGBoost objective (the misspelled name is kept for
        backward compatibility with existing callers).
    :return: {'model': model_dict} with the regressor, importances and report.
    """
    # Pass everything by keyword: the original relied on positional order,
    # which silently breaks whenever xgboost reorders its constructor/fit
    # signatures between versions.
    regressor = XGBRegressor(max_depth=max_depth,
                             learning_rate=learning_rate,
                             n_estimators=n_estimators,
                             silent=silent,
                             objective=objectibe,
                             booster=booster,
                             n_jobs=n_jobs,
                             nthread=nthread,
                             gamma=gamma,
                             min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree,
                             colsample_bylevel=colsample_bylevel,
                             reg_alpha=reg_alpha,
                             reg_lambda=reg_lambda,
                             scale_pos_weight=scale_pos_weight,
                             base_score=base_score,
                             random_state=random_state,
                             seed=seed,
                             missing=missing)
    regressor.fit(table[feature_cols], table[label_col],
                  sample_weight=sample_weight,
                  eval_set=eval_set,
                  eval_metric=eval_metric,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=verbose,
                  xgb_model=xgb_model,
                  sample_weight_eval_set=sample_weight_eval_set)

    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_

    plot_importance(regressor)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance

    # Parameter table: the two data columns first, then every
    # hyper-parameter reported by the fitted regressor.
    get_param_list = [['feature_cols', feature_cols], ['label_col', label_col]]
    for key, value in get_param.items():
        get_param_list.append([key, value])
    get_param_df = pd.DataFrame(data=get_param_list, columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_cols).T

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## XGB Regression Result
    |
    | ### Plot Importance
    | {image_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {table_parameter}
    |
    """.format(image_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               table_parameter=pandasDF2MD(get_param_df))))
    out_model['report'] = rb.get()

    return {'model': out_model}
def _pca(table, input_cols, new_column_name='projected_', n_components=None, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None):
    """Fit PCA on the input columns and append the projected components.

    :param table: input DataFrame.
    :param input_cols: list of numeric column names to project.
    :param new_column_name: prefix for the projected output columns.
    :param n_components: number of components; defaults to len(input_cols).
    :return: {'out_table': table + projections, 'model': model_dict}.
    """
    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    pca = PCA(n_components, copy, whiten, svd_solver, tol, iterated_power, random_state)
    pca_model = pca.fit(table[input_cols])

    column_names = [new_column_name + str(i) for i in range(n_components)]
    pca_result = pca_model.transform(table[input_cols])
    # BUG FIX: columns must be the flat list itself; wrapping it in another
    # list ([column_names]) created a MultiIndex on the output frame.
    out_df = pd.DataFrame(data=pca_result, columns=column_names)

    res_components = pca_model.components_
    # Same fix here: input_cols, not [input_cols].
    res_components_df = pd.DataFrame(data=res_components, columns=input_cols)
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_
    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # Visualization: the first two projected components (or the single
    # component against itself when only one is available).
    plt.figure()
    if res_n_components == 1:
        plt.scatter(pca_result[:, 0], pca_result[:, 0])
    else:
        plt.scatter(pca_result[:, 0], pca_result[:, 1])
    plt_two = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    |
    | ### Plot
    | The x-axis and y-axis of the following plot is projected0 and projected1, respectively.
    | {image1}
    |
    | ### Result
    | {table1}
    | only showing top 20 rows
    |
    | ### Parameters
    | {parameter1}
    |
    | ### Components
    | {table2}
    |
    | ### Mean
    | {array1}
    |
    | ### Explained Variance
    | {array2}
    |
    """.format(table1=pandasDF2MD(out_df, 20), image1=plt_two,
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df),
               array1=res_mean, array2=res_explained_variance)))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['report'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names
    return {'out_table': out_df, 'model': model}
def _collaborative_filtering_train(table, user_col, item_col, rating_col, N=10, filter=True, k=5, based='item', mode='train', method='cosine', weighted=True, centered=True, targets=None, normalize=True, workers=1, filter_minus=False, maintain_already_scored=True):
    """Train an item- or user-based collaborative-filtering model.

    Builds a sparse rating matrix, optionally mean-centers it, computes a
    similarity matrix (cosine / pearson / jaccard / adjusted cosine), and
    either returns the model (mode='train') or a top-N recommendation table
    (mode='Topn').

    :param table: input DataFrame.
    :param user_col: user id column name.
    :param item_col: item id column name.
    :param rating_col: rating column name.
    :param N: number of recommendations per user in 'Topn' mode.
    :param filter: passed through to the _recommend helper (shadows the
        builtin; kept for interface compatibility).
    :param k: number of neighbors.
    :param based: 'item' or 'user'.
    :return: {'out_table': ...} in 'Topn' mode, otherwise {'model': ...}.
    """
    # Normalization only applies to user-based filtering.
    if based == 'item':
        normalize = False
    table_user_col = table[user_col]
    table_item_col = table[item_col]
    # NOTE: rebinds the parameter from a column *name* to the rating Series.
    rating_col = table[rating_col]
    user_encoder = preprocessing.LabelEncoder()
    item_encoder = preprocessing.LabelEncoder()
    user_encoder.fit(table_user_col)
    item_encoder.fit(table_item_col)
    user_correspond = user_encoder.transform(table_user_col)
    item_correspond = item_encoder.transform(table_item_col)
    # check_cen stores rating+1 so that a zero rating is distinguishable
    # from a structurally-missing entry in the sparse matrix.
    if based == 'item':
        item_users = csr_matrix(
            (rating_col, (item_correspond, user_correspond)))
        check_cen = csr_matrix(
            (rating_col + 1, (item_correspond, user_correspond)))
    else:
        item_users = csr_matrix(
            (rating_col, (user_correspond, item_correspond)))
        check_cen = csr_matrix(
            (rating_col + 1, (user_correspond, item_correspond)))
    centered_ratings = item_users.copy()
    # NOTE: in the 'user' branch the rows are users, so these names are
    # swapped back further below.
    num_item, num_user = item_users.shape
    if centered:
        # Subtract each row's mean rating from its observed entries.
        update_item = []
        update_user = []
        update_rating = []
        for item in range(num_item):
            index = 0
            sum = 0  # NOTE(review): shadows builtin sum() throughout
            for user, rating in _nonzeros(check_cen, item):
                index += 1
                sum += rating
            # -1 undoes the +1 offset baked into check_cen.
            avg = sum / index - 1
            for user, rating in _nonzeros(check_cen, item):
                update_item.append(item)
                update_user.append(user)
                update_rating.append(avg)
        centered_ratings -= csr_matrix(
            (update_rating, (update_item, update_user)))
    # Adjusted cosine / normalization need the matrix transposed so rows
    # become users.
    if (method == 'adjusted' or normalize) and based == 'item':
        check_cen = check_cen.transpose().tocsr()
    if based == 'user':
        tmp = num_user
        num_user = num_item
        num_item = tmp
    user_avg = []
    if normalize:
        # Per-user average rating, used later to de-normalize predictions.
        for user in range(num_user):
            index = 0
            sum = 0
            # NOTE(review): the inner loop variable shadows the outer
            # `user` index — looks unintended but behavior is preserved
            # because the row index is evaluated before the loop body runs.
            # TODO confirm against the original author's intent.
            for user, rating in _nonzeros(check_cen, user):
                index += 1
                sum += rating
            avg = sum / index
            user_avg.append(avg)
    if method == 'adjusted':
        # Adjusted cosine: subtract each user's mean from every item, then
        # fall through to plain cosine on the adjusted matrix.
        update_item = []
        update_user = []
        update_rating = []
        for user in range(num_user):
            sum = 0
            for item, rating in _nonzeros(check_cen, user):
                sum += rating
            # NOTE(review): divides by num_item (all items), not by the
            # user's rated-item count — averages over unrated items too.
            avg = sum / num_item
            for item in range(num_item):
                update_item.append(item)
                update_user.append(user)
                update_rating.append(avg)
        if based == 'item':
            centered_ratings -= csr_matrix(
                (update_rating, (update_item, update_user)))
        else:
            centered_ratings -= csr_matrix(
                (update_rating, (update_user, update_item)))
        method = 'cosine'
    # Undo the earlier user/item swap.
    if based == 'user':
        tmp = num_user
        num_user = num_item
        num_item = tmp
    # Row-by-row similarity matrix.
    if method == 'cosine':
        similar_coeff = cosine_similarity(centered_ratings)
    elif method == 'pearson':
        # Pearson == cosine on row-demeaned dense rows.
        result = []
        for i in centered_ratings.toarray():
            result.append(i - np.average(i))
        similar_coeff = cosine_similarity(result)
    elif method == 'jaccard':
        similar_coeff = 1 - pairwise_distances(centered_ratings.toarray(), metric="hamming")
    if based == 'user':
        item_users = item_users.transpose().tocsr()
    if mode == 'Topn':
        # Produce a top-N recommendation table instead of a model.
        if targets is None:
            targets = user_encoder.classes_
        if table_user_col.dtype in (np.floating, float, np.int, int, np.int64):
            targets = [float(i) for i in targets]
        targets_en = user_encoder.transform(targets)
        Topn_result = []
        if workers == 1:
            for user in targets_en:
                recommendations_corre = _recommend(user, item_users, similar_coeff, N, k, method, weighted, centered, based, normalize, user_avg, filter, filter_minus, maintain_already_scored)
                recommendations = []
                # Flatten into [item, rating, item, rating, ...] pairs.
                for (item, rating) in recommendations_corre:
                    recommendations += [
                        item_encoder.inverse_transform([item])[0], rating
                    ]
                Topn_result += [recommendations]
        else:
            # Fan the target users out over `workers` processes.
            Topn_result_tmp = apply_by_multiprocessing_list_to_list(
                targets_en, _recommend_multi, item_users=item_users,
                similar_coeff=similar_coeff, N=N, k=k, method=method,
                weighted=weighted, centered=centered, based=based,
                normalize=normalize, user_avg=user_avg,
                item_encoder=item_encoder, workers=workers,
                filter_minus=filter_minus,
                maintain_already_scored=maintain_already_scored)
            Topn_result = []
            for i in range(workers):
                Topn_result += Topn_result_tmp[i]
        Topn_result = pd.DataFrame(Topn_result)
        Topn_result = pd.concat([pd.DataFrame(targets), Topn_result], axis=1, ignore_index=True)
        # First column is the user, then alternating item/rating columns.
        column_names = ['user_name']
        for i in range(int((Topn_result.shape[1] - 1) / 2)):
            column_names += ['item_top%d' % (i + 1), 'rating_top%d' % (i + 1)]
        Topn_result.columns = column_names
        return {'out_table': Topn_result}
    # mode == 'train': package everything needed for later prediction.
    parameters = dict()
    parameters['Number of Neighbors'] = k
    parameters['Based'] = based
    if method == 'cosine':
        parameters['Similarity method'] = 'Cosine'
    elif method == 'jaccard':
        parameters['Similarity method'] = 'Jaccard'
    elif method == 'pearson':
        parameters['Similarity method'] = 'Pearson'
    else:
        parameters['Similarity method'] = 'Adjusted Cosine'
    parameters['Use Centered Mean'] = centered
    parameters['Use Weighted Rating'] = weighted
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Collaborative Filtering Result
    |
    | ### Parameters
    | {parameters}
    |
    """.format(parameters=dict2MD(parameters))))
    model = _model_dict('collaborative filtering')
    model['weighted'] = weighted
    model['k'] = k
    model['method'] = method
    model['centered_ratings'] = centered_ratings
    model['item_encoder'] = item_encoder
    model['user_encoder'] = user_encoder
    model['item_users'] = item_users
    model['user_col'] = user_col
    model['item_col'] = item_col
    model['based'] = based
    model['_repr_brtc_'] = rb.get()
    model['normalize'] = normalize
    model['user_avg'] = user_avg
    return {'model': model}
def _doc2vec(table, input_col, dm=1, vector_size=100, window=10, min_count=1, max_vocab_size=None, train_epoch=100, workers=4, alpha=0.025, min_alpha=0.025, seed=None, hs=1, negative=5, ns_exponent=0.75):
    """Train a gensim Doc2Vec model on a tokenized-document column.

    :param table: input DataFrame.
    :param input_col: column holding tokenized documents.
    :param dm: 1 (PV-DM) or 0 (PV-DBOW); accepts int or string.
    :return: {'model', 'doc_table' (document vectors), 'word_table' (word vectors)}.
    """
    # When no seed is given, report None but train with a concrete random seed.
    if seed is None:
        random_state = seed
        seed = randint(0, 0xffffffff)
    else:
        random_state = seed
    docs = table[input_col]
    tagged_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]
    # BUG FIX: dm may arrive as an int or a string. Comparing to the string
    # "1" alone sent the integer default (dm=1) down the PV-DBOW branch.
    if str(dm) == "1":
        dm = 1
        algo = 'PV-DM'
    else:
        dm = 0
        algo = 'PV-DBOW'
    d2v = Doc2Vec(documents=tagged_docs, dm=dm, vector_size=vector_size,
                  window=window, alpha=alpha, min_alpha=min_alpha, seed=seed,
                  min_count=min_count, max_vocab_size=max_vocab_size,
                  workers=workers, epochs=train_epoch, hs=hs,
                  negative=negative, ns_exponent=ns_exponent)
    vocab = d2v.wv.vocab
    params = {
        'Input column': input_col,
        'Training algorithm': algo,
        'Dimension of Vectors': vector_size,
        'Window': window,
        'Minimum count': min_count,
        'Max vocabulary size': max_vocab_size,
        'Train epoch': train_epoch,
        'Number of workers': workers,
        'Alpha': alpha,
        'Minimum alpha': min_alpha,
        'Seed': random_state,
        'Hierarchical softmax': hs,
        'Negative': negative,
        'Negative sampling exponent': ns_exponent
    }
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Doc2Vec Result
    |
    | ### Parameters
    | {params}
    """.format(params=dict2MD(params))))
    model = _model_dict('doc2vec_model')
    model['params'] = params
    model['d2v'] = d2v
    model['_repr_brtc_'] = rb.get()
    # Inferred vector per input document.
    out_table1 = table.copy()
    out_table1['document_vectors'] = [
        d2v.infer_vector(doc.words).tolist() for doc in tagged_docs
    ]
    # Learned vector per vocabulary word.
    out_table2 = pd.DataFrame({
        'words': d2v.wv.index2word,
        'word_vectors': d2v.wv[vocab].tolist()
    })
    return {'model': model, 'doc_table': out_table1, 'word_table': out_table2}
def _penalized_linear_regression_train(table, feature_cols, label_col, regression_type='ridge', alpha=1.0, l1_ratio=0.5, fit_intercept=True, max_iter=1000, tol=0.0001, random_state=None):
    """Train a Ridge/Lasso/ElasticNet regression model and build diagnostics.

    :param table: input DataFrame.
    :param feature_cols: list of feature column names.
    :param label_col: name of the label column.
    :param regression_type: 'ridge', 'lasso' or 'elastic_net'.
    :return: {'model': model_dict} with coefficients, residuals and plots.
    :raises: runtime error for an unknown regression_type.
    """
    out_table = table.copy()
    features = out_table[feature_cols]
    label = out_table[label_col]
    if regression_type == 'ridge':
        # NOTE: Ridge deliberately ignores max_iter (max_iter=None, solver='auto').
        regression_model = Ridge(alpha=alpha, fit_intercept=fit_intercept, max_iter=None, tol=tol, solver='auto', random_state=random_state)
    elif regression_type == 'lasso':
        regression_model = Lasso(alpha=alpha, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, random_state=random_state, selection='random')
    elif regression_type == 'elastic_net':
        regression_model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, random_state=random_state, selection='random')
    else:
        raise_runtime_error("Please check 'regression_type'.")

    # BUG FIX: fit exactly once. The original refit the model for every
    # coefficient lookup; with selection='random' and no fixed random_state
    # the reported coefficients could differ from the model used to predict.
    regression_model.fit(features, label)

    out_table1 = pd.DataFrame([])
    out_table1['x_variable_name'] = [variable for variable in feature_cols]
    out_table1['coefficient'] = regression_model.coef_
    if fit_intercept:
        intercept = pd.DataFrame(
            [['intercept', regression_model.intercept_]],
            columns=['x_variable_name', 'coefficient'])
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        out_table1 = pd.concat([out_table1, intercept], ignore_index=True)

    predict = regression_model.predict(features)
    residual = label - predict
    out_table['predict'] = predict
    out_table['residual'] = residual

    if regression_type == 'elastic_net':
        params = {
            'Feature Columns': feature_cols,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'L1 Ratio': l1_ratio,
            'Fit Intercept': fit_intercept,
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }
    else:
        params = {
            'Feature Columns': feature_cols,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'Fit Intercept': fit_intercept,
            # typo fix: was 'Maxium Number of Iterations'
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }
    score = {
        'MSE': mean_squared_error(label, predict),
        'R2': r2_score(label, predict)
    }

    # Predicted-vs-actual scatter with the identity line.
    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()

    # Residuals vs predictions.
    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    # Q-Q plot of the residuals.
    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    # Residual distribution.
    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()

    # Magnitude of the (sorted) model coefficients.
    plt.figure()
    predictors = features.columns
    coef = Series(regression_model.coef_, predictors).sort_values()
    coef.plot(kind='bar', title='Model Coefficients')
    plt.tight_layout()
    fig_model_coefficients = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | # Penalized Linear Regression Result
    | ### Selected Parameters:
    | {params}
    |
    | ## Results
    | ### Model Parameters
    | {out_table1}
    |
    | ### Prediction and Residual
    | {out_table2}
    |
    | ### Regression Score
    | {score}
    |
    """.format(params=dict2MD(params), out_table1=pandasDF2MD(out_table1),
               out_table2=pandasDF2MD(out_table, num_rows=len(out_table) + 1),
               score=dict2MD(score))))
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    |
    | ### Magnitude of Coefficients
    | {image5}
    |
    """.format(image1=fig_actual_predict, image2=fig_residual_1,
               image3=fig_residual_2, image4=fig_residual_3,
               image5=fig_model_coefficients)))

    model = _model_dict('penalized_linear_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['regression_type'] = regression_type
    model['regression_model'] = regression_model
    model['parameters'] = params
    model['model_parameters'] = out_table1
    model['prediction_residual'] = out_table
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _gaussian_mixture_train(table, input_cols, number_of_components=1, covariance_type='full', tolerance=0.001, \
                            regularize_covariance=1e-06, max_iteration=100, initial_params='kmeans', seed=None):
    """Fit a Gaussian mixture model and summarize each fitted component.

    :param table: input DataFrame.
    :param input_cols: list of numeric column names to cluster.
    :param number_of_components: number of mixture components.
    :return: {'model': model_dict} with the fitted model and a per-component summary.
    """
    gmm = GaussianMixture(n_components=number_of_components, covariance_type=covariance_type, tol=tolerance, \
                          reg_covar=regularize_covariance, max_iter=max_iteration, init_params=initial_params, random_state=seed)
    X_train = table[input_cols]
    gmm.fit(X_train)

    # One summary row per component: index, weight, mean vector, covariance.
    component_indices = list(range(number_of_components))
    mean_arr = [gmm.means_[idx].tolist() for idx in component_indices]
    covar_arr = [gmm.covariances_[idx].tolist() for idx in component_indices]

    out_table = pd.DataFrame()
    out_table['component_number'] = component_indices
    out_table['weight'] = gmm.weights_
    out_table['mean_coordinate'] = mean_arr
    out_table['covariance_matrix'] = covar_arr

    params = {
        'Input Columns': input_cols,
        'Number of Components': number_of_components,
        'Covariance Type': covariance_type,
        'Tolerance': tolerance,
        'Regularization of Covariance': regularize_covariance,
        'Number of Iteration': max_iteration,
        'Method to Initialize': initial_params
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    |## Gaussian Mixture Train Result
    |
    |### Parameters
    |
    | {params}
    |
    |### Summary
    |
    |{result_table}
    |
    """.format(params=dict2MD(params), result_table=pandasDF2MD(out_table))))

    model = _model_dict('gaussian_mixture_train')
    model['input_cols'] = input_cols
    model['number_of_components'] = number_of_components
    model['covariance_type'] = covariance_type
    model['tolerance'] = tolerance
    model['regularize_covariance'] = regularize_covariance
    model['max_iteration'] = max_iteration
    model['initial_params'] = initial_params
    model['seed'] = seed
    model['summary'] = out_table
    model['gmm'] = gmm
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _naive_bayes_train(table, feature_cols, label_col, alpha=1.0, fit_prior=True, class_prior=None):
    """Train a multinomial Naive Bayes model and build the BRTC report.

    :param table: input DataFrame.
    :param feature_cols: list of feature column names.
    :param label_col: name of the label column.
    :param alpha: additive smoothing parameter (lambda).
    :param fit_prior: whether to learn class prior probabilities.
    :param class_prior: optional list of "label:prior" strings overriding priors.
    :return: {'model': model_dict}.
    """
    X = table[feature_cols]
    y = table[label_col]

    encoder = preprocessing.LabelEncoder()
    encoder.fit(y)
    y_encoded = encoder.transform(y)

    # User-supplied priors arrive as "label:value" strings; place each value
    # at the encoded index of its label.
    if class_prior is not None:
        priors = [0] * len(class_prior)
        for entry in class_prior:
            parts = entry.split(":")
            priors[encoder.transform([parts[0]])[0]] = float(parts[1])
        class_prior = priors

    nb_model = MultinomialNB(alpha, fit_prior, class_prior)
    nb_model.fit(X, y_encoded)

    class_log_prior = nb_model.class_log_prior_
    feature_log_prob_ = nb_model.feature_log_prob_
    # One row per class: label, log prior (pi), then per-feature log
    # probabilities (theta_*).
    tmp_result = np.hstack(
        (list(map(list, zip(*[encoder.classes_] + [class_log_prior]))),
         (feature_log_prob_)))
    column_names = ['labels', 'pi'] + ['theta_' + col for col in feature_cols]
    result_table = pd.DataFrame.from_records(tmp_result, columns=column_names)

    y_pred = nb_model.predict(X)

    get_param = dict()
    get_param['Lambda'] = alpha
    get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    cnf_matrix = confusion_matrix(y_encoded, y_pred)
    plt.figure()
    _plot_confusion_matrix(cnf_matrix, classes=encoder.classes_, title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)

    accuracy = nb_model.score(X, y_encoded) * 100

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Naive Bayes Classification Result
    |
    | ### Model:Multinomial
    | {result_table}
    | ### Parameters
    | {table_parameter}
    | ### Predicted vs Actual
    | {image1}
    | #### Accuacy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix, accuracy=accuracy,
               result_table=pandasDF2MD(result_table),
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = encoder
    model['nb_model'] = nb_model
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _svm_classification_train(table, feature_cols, label_col, gamma_val, c=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=True, tol=1e-3, max_iter=-1, random_state=None, class_weight=None):
    """Train an SVM classifier (sklearn SVC) and report its parameters.

    :param table: input DataFrame.
    :param feature_cols: list of feature column names.
    :param label_col: name of the (discrete) label column.
    :param gamma_val: explicit numeric gamma, used when gamma == 'other'.
    :param class_weight: optional per-class weights, ordered by sorted label.
    :return: {'model': model_dict}.
    :raises ValueError: when class_weight length does not match label count.
    """
    _table = table.copy()
    feature_names, features = check_col_type(table, feature_cols)
    _label_col = _table[label_col]

    # Classification requires a discrete target.
    if sklearn_utils.multiclass.type_of_target(_label_col) == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')

    class_labels = sorted(set(_label_col))
    if class_weight is not None:
        if len(class_weight) != len(class_labels):
            raise ValueError(
                "Number of class weights should match number of labels.")
        # Pair each sorted label with its supplied weight.
        class_weight = dict(zip(class_labels, class_weight))

    # 'other' means the caller supplied an explicit numeric gamma.
    _gamma = gamma_val if gamma == 'other' else gamma

    _svc = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=_gamma,
                   coef0=coef0, shrinking=shrinking, probability=probability,
                   tol=tol, max_iter=max_iter, random_state=random_state,
                   class_weight=class_weight)
    _svc_model = _svc.fit(features, _label_col)

    get_param = _svc.get_params()
    get_param['feature_cols'] = feature_names
    get_param['label_col'] = label_col

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter}
    """.format(table_parameter=dict2MD(get_param))))

    _model = _model_dict('svc_model')
    _model['svc_model'] = _svc_model
    _model['features'] = feature_cols
    _model['_repr_brtc_'] = rb.get()
    return {'model': _model}
def _logistic_regression_train(table, feature_cols, label_col, penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):
    """Train a logistic regression model and report coefficients and fit statistics.

    Computes AIC/BIC from the training log-likelihood and, for the binary
    case, coefficient standard errors, Wald statistics and p-values.

    :param table: input DataFrame.
    :param feature_cols: list of feature column names.
    :param label_col: name of the (discrete) label column.
    :return: {'model': model_dict}.
    :raises: error '0718' when the label column is continuous.
    """
    feature_names, features = check_col_type(table, feature_cols)
    features = pd.DataFrame(features, columns=feature_names)
    label = table[label_col]
    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    # Keyword arguments: the original positional call silently breaks when
    # sklearn reorders LogisticRegression's signature between versions.
    lr_model = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state, solver=solver,
                                  max_iter=max_iter, multi_class=multi_class,
                                  verbose=verbose, warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2

    prob = lr_model.predict_proba(features)
    prob_trans = prob.T
    classes_dict = {c: i for i, c in enumerate(classes)}
    tmp_label = np.array([classes_dict[i] for i in label])

    # BUG FIX: multiplying the per-row probabilities underflows to 0 for
    # moderately large tables, turning log(likelihood) into -inf. Summing
    # the logs is mathematically identical and numerically stable.
    log_likelihood = float(
        np.sum(np.log(prob_trans[tmp_label, np.arange(len(table))])))
    # Number of estimated parameters per class block.
    if fit_intercept:
        k = len(feature_cols) + 1
    else:
        k = len(feature_cols)
    aic = 2 * k - 2 * log_likelihood
    bic = np.log(len(table)) * k - 2 * log_likelihood

    if is_binary:
        # Design matrix, with a leading constant column when an intercept
        # was fitted.
        if fit_intercept:
            x_design = np.hstack([np.ones((features.shape[0], 1)), features])
        else:
            x_design = features.values
        # p*(1-p) per row (product over the two class probabilities).
        v = np.prod(prob, axis=1)
        x_design_modi = (x_design.T * v).T
        cov_logit = np.linalg.inv(np.dot(x_design_modi.T, x_design))
        std_err = np.sqrt(np.diag(cov_logit))
        if fit_intercept:
            logit_params = np.insert(coefficients, 0, intercept)
        else:
            logit_params = coefficients
        # Wald test of each coefficient against zero (chi^2 with 1 dof).
        wald = (logit_params / std_err) ** 2
        p_values = 1 - chi2.cdf(wald, 1)
    else:
        if fit_intercept:
            x_design = np.hstack([np.ones((features.shape[0], 1)), features])
        else:
            x_design = features.values
        # One standard-error vector per class (one-vs-rest).
        std_err = []
        for i in range(len(classes)):
            v = prob.T[i] * (1 - prob.T[i])
            x_design_modi = (x_design.T * v).T
            cov_logit = np.linalg.inv(np.dot(x_design_modi.T, x_design))
            std_err.append(np.sqrt(np.diag(cov_logit)))
        std_err = np.array(std_err)

    # Coefficient summary table (intercept row first when fitted).
    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_names})
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)), axis=0)
    else:
        summary = pd.DataFrame({'features': feature_names})
        coef_trans = np.transpose(coefficients)
    if not is_binary:
        summary = pd.concat(
            (summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
    else:
        summary = pd.concat(
            (summary, pd.DataFrame(coef_trans, columns=[classes[0]])), axis=1)
    if is_binary:
        summary = pd.concat(
            (summary,
             pd.DataFrame(std_err, columns=['standard_error']),
             pd.DataFrame(wald, columns=['wald_statistic']),
             pd.DataFrame(p_values, columns=['p_value'])), axis=1)
    else:
        columns = [
            'standard_error_{}'.format(classes[i]) for i in range(len(classes))
        ]
        summary = pd.concat(
            (summary, pd.DataFrame(std_err.T, columns=columns)), axis=1)
        # Interleave coefficient and standard-error columns per class.
        arrange_col = ['features']
        for i in range(len(classes)):
            arrange_col.append(classes[i])
            arrange_col.append('standard_error_{}'.format(classes[i]))
        summary = summary[arrange_col]

    rb = BrtcReprBuilder()
    if is_binary:
        rb.addMD(strip_margin("""
        | ## Logistic Regression Result
        | ### Summary
        | {table1}
        |
        | ##### Column '{small}' is the coefficients under the assumption ({small} = 0, {big} = 1).
        |
        | #### AIC : {aic}
        |
        | #### BIC : {bic}
        """.format(small=classes[0], big=classes[1],
                   table1=pandasDF2MD(summary, num_rows=100),
                   aic=aic, bic=bic)))
    else:
        rb.addMD(strip_margin("""
        | ## Logistic Regression Result
        | ### Summary
        | {table1}
        |
        | ##### Each column whose name is one of classes of Label Column is the coefficients under the assumption it is 1 and others are 0.
        |
        | ##### For example, column '{small}' is the coefficients under the assumption ({small} = 1, others = 0).
        |
        | #### AIC : {aic}
        |
        | #### BIC : {bic}
        """.format(small=classes[0],
                   table1=pandasDF2MD(summary, num_rows=100),
                   aic=aic, bic=bic)))

    model = _model_dict('logistic_regression_model')
    model['standard_errors'] = std_err
    model['aic'] = aic
    model['bic'] = bic
    if is_binary:
        model['wald_statistics'] = wald
        model['p_values'] = p_values
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()
    model['summary'] = summary
    return {'model': model}
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10), prediction_col='prediction', init='k-means++', n_init=10, max_iter=300, tol=1e-4, precompute_distances='auto', seed=None, n_jobs=1, algorithm='auto', n_samples=None):
    """Train K-Means for every k in ``n_clusters_list``, select the k with the
    highest silhouette score, and predict cluster labels with that best model.

    Returns a dict with ``out_table`` (input table plus ``prediction_col``)
    and ``model`` (best k, best centers, fitted best model, report).
    Per-k silhouette plots and a score-vs-k curve are embedded in the report.
    """
    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]
    # Silhouette requires at least 2 clusters, hence the > 1 check.
    validate(all_elements_greater_than(n_clusters_list, 1, 'n_clusters_list'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))
    # 2-D PCA projection of the data, used only for the scatter visualizations.
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    silhouette_list = []
    silouette_samples_list = []
    models = []
    centers_list = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init,
                           max_iter=max_iter, tol=tol,
                           precompute_distances=precompute_distances,
                           verbose=0, random_state=seed, copy_x=True,
                           n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_
        centers_list.append(centersk)
        # Mean silhouette score over all samples for this k.
        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        # Per-sample silhouette values, used for the per-cluster band plot.
        samples = silhouette_samples(inputarr, predict)
        silouette_samples_list.append(samples)
        pca2_centers = pca2_model.transform(centersk)
        # ax1: stacked silhouette bands per cluster; ax2: PCA scatter per cluster.
        _, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0
        for i, color in zip(range(k), colors):
            si = samples[predict == i]
            si.sort()
            sizei = si.shape[0]
            y_upper = y_lower + sizei
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, si,
                              facecolor=color, edgecolor=color, alpha=0.7)
            y_lower = y_upper
            ax2.scatter(pca2[:, 0][predict == i], pca2[:, 1][predict == i],
                        color=color)
        # Red line marks the mean silhouette score for this k.
        ax1.axvline(x=score, color="red")
        ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 1], marker='x',
                    edgecolors=1, s=200, color=colors)
        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)
    # Best k = the one with the maximal mean silhouette score.
    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    best_labels = best_model.labels_
    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples,
                                       best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)
    # Silhouette-score-vs-k curve.
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    plt.clf()
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silloutte metrics:
    | {fig_silhouette}
    | - best K: {best_k}
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette, best_k=best_k, fig_pca=fig_pca,
               fig_centers=fig_centers, fig_samples=fig_samples)))
    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))
    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()
    out_table = table.copy()
    out_table[prediction_col] = predict
    return {'out_table': out_table, 'model': model}
def _svd2(table, input_cols, new_column_name='projected_', full_matrices=False):
    """Singular value decomposition of the selected columns.

    Computes A = U @ diag(S) @ Vh and returns:
      - ``out_table1``: U (left singular vectors),
      - ``out_table2``: two rows — the singular values and their cumulative
        share of the total,
      - ``out_table3``: V (right singular vectors),
      - ``out_table4``: the input table with the projections U * S appended,
      - ``model``: parameters plus the right singular vectors and report.

    Parameters
    ----------
    table : pandas.DataFrame
    input_cols : list of str
        Columns to decompose.
    new_column_name : str
        Prefix for the appended projection columns.
    full_matrices : bool
        Passed through to ``np.linalg.svd``.
    """
    A = table[input_cols]
    u, s, vh = np.linalg.svd(A, full_matrices=full_matrices)
    # Projection of each row onto the singular directions: column i is
    # u[:, i] * s[i]. Vectorized; replaces the old per-component Python loop.
    projection = u[:, :len(s)] * s
    # Cumulative share of the total: cumsum(s / s.sum()) — same recurrence as
    # the old element-wise accumulation loop.
    s_normal = np.cumsum(s / s.sum())
    # Row 0: raw singular values; row 1: cumulative ratios.
    s = np.array([s, s_normal])
    v = vh.T
    column_name_u = ['u%d' % (i + 1) for i in range(u.shape[1])]
    column_name_s = ['s%d' % (i + 1) for i in range(s.shape[1])]
    column_name_v = ['v%d' % (i + 1) for i in range(v.shape[1])]
    column_name_projection = [new_column_name + '%d' % (i + 1)
                              for i in range(s.shape[1])]
    # Flat column list (the old code wrapped it in another list, creating a
    # throwaway MultiIndex before the columns were reassigned below).
    out_table4 = pd.DataFrame(data=projection, columns=column_name_projection)
    out_table4 = pd.concat([table.reset_index(drop=True), out_table4], axis=1)
    out_table4.columns = table.columns.values.tolist() + column_name_projection
    res_param1 = {}
    res_param1['Input Columns'] = input_cols
    res_param1['full_matrices'] = full_matrices
    res_param2 = {}
    res_param2['u'] = u.shape
    res_param2['s'] = s.shape
    res_param2['v'] = v.shape
    res_param2['Projected Matrix'] = projection.shape
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## SVD Result
    |
    | ### Dimensions of Matrices
    | {parameter2}
    |
    | ### Parameters
    | {parameter1}
    """.format(parameter1=dict2MD(res_param1), parameter2=dict2MD(res_param2))))
    model = _model_dict('svd')
    model['right_singular_vectors'] = pd.DataFrame(v, columns=column_name_v)
    model['input_cols'] = input_cols
    model['parameters'] = res_param1
    model['_repr_brtc_'] = rb.get()
    return {
        'out_table1': pd.DataFrame(u, columns=column_name_u),
        'out_table2': pd.DataFrame(s, columns=column_name_s),
        'out_table3': pd.DataFrame(v, columns=column_name_v),
        'out_table4': out_table4,
        'model': model
    }
def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction', init='k-means++', n_init=10, max_iter=300, tol=1e-4, precompute_distances='auto', seed=None, n_jobs=1, algorithm='auto', n_samples=None):
    """Fit a K-Means model on the selected columns and append the predicted
    cluster label to a copy of the input table.

    Returns a dict with ``out_table`` (input table plus ``prediction_col``)
    and ``model`` (the fitted estimator, input columns, and a report with
    center/sample/PCA plots).
    """
    features = table[input_cols]
    if n_samples is None:
        n_samples = len(features)
    validate(greater_than_or_equal_to(n_clusters, 1, 'n_clusters'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))
    kmeans = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init,
                      max_iter=max_iter, tol=tol,
                      precompute_distances=precompute_distances, verbose=0,
                      random_state=seed, copy_x=True, n_jobs=n_jobs,
                      algorithm=algorithm)
    kmeans.fit(features)
    centers = kmeans.cluster_centers_
    cluster_labels = kmeans.labels_
    # 2-D PCA projection is used only for the scatter visualization.
    pca2_model = PCA(n_components=2).fit(features)
    pca2 = pca2_model.transform(features)
    fig_centers = _kmeans_centers_plot(input_cols, centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, centers)
    fig_pca = _kmeans_pca_plot(cluster_labels, centers, pca2_model, pca2)
    params = {'input_cols': input_cols,
              'n_clusters': n_clusters,
              'init': init,
              'n_init': n_init,
              'max_iter': max_iter,
              'tol': tol,
              'precompute_distances': precompute_distances,
              'seed': seed,
              'n_jobs': n_jobs,
              'algorithm': algorithm}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers}
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=kmeans.n_iter_, fig_cluster_centers=fig_centers,
               fig_pca=fig_pca, fig_samples=fig_samples,
               params=dict2MD(params))))
    model = _model_dict('kmeans')
    model['model'] = kmeans
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()
    out_table = table.copy()
    out_table[prediction_col] = cluster_labels
    return {'out_table': out_table, 'model': model}
def _glm_train(table, feature_cols, label_col, family="Gaussian", link="ident", fit_intercept=True):
    """Fit a statsmodels GLM with the requested family and link function.

    Parameters
    ----------
    table : pandas.DataFrame
    feature_cols : list of str
    label_col : str
    family : one of "Gaussian", "inv_Gaussian", "binomial", "Poisson",
        "neg_binomial", "gamma", "Tweedie".
    link : one of "ident", "log", "logit", "probit", "cloglog", "pow",
        "nbinom".
    fit_intercept : bool
        When True, a constant column is added to the design matrix.

    Raises
    ------
    Exception
        If ``label_col`` also appears in ``feature_cols``.
    ValueError
        If ``family`` or ``link`` is not one of the supported names
        (previously an unknown value caused a NameError), or if statsmodels
        rejects the link for the chosen family.
    """
    if label_col in feature_cols:
        raise Exception("%s is duplicated." % label_col)
    features = table[feature_cols]
    label = table[label_col]
    family_classes = {
        "Gaussian": sm.families.Gaussian,
        "inv_Gaussian": sm.families.InverseGaussian,
        "binomial": sm.families.Binomial,
        "Poisson": sm.families.Poisson,
        "neg_binomial": sm.families.NegativeBinomial,
        "gamma": sm.families.Gamma,
        "Tweedie": sm.families.Tweedie,
    }
    link_classes = {
        "ident": sm.families.links.identity,
        "log": sm.families.links.log,
        "logit": sm.families.links.logit,
        "probit": sm.families.links.probit,
        # Was sm.families.links.cLogLog — that attribute does not exist.
        "cloglog": sm.families.links.cloglog,
        "pow": sm.families.links.Power,
        # Was sm.families.links.binom — that attribute does not exist.
        "nbinom": sm.families.links.nbinom,
    }
    if family not in family_classes:
        raise ValueError("Unsupported family: %s" % family)
    if link not in link_classes:
        raise ValueError("Unsupported link: %s" % link)
    # The link must be given to the family. sm.GLM has no `link` keyword, so
    # the previous `sm.GLM(..., link=sm_link)` silently ignored the requested
    # link and always used the family's default.
    sm_family = family_classes[family](link=link_classes[link])
    if fit_intercept:
        glm_model = sm.GLM(label, sm.add_constant(features),
                           family=sm_family).fit()
    else:
        glm_model = sm.GLM(label, features, family=sm_family).fit()
    summary = glm_model.summary().as_html()
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## GLM Result
    | ### Summary
    |
    """))
    rb.addHTML(summary)
    model = _model_dict('glm_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['family'] = family
    model['link'] = link
    model['coefficients'] = glm_model.params
    model['aic'] = glm_model.aic
    model['bic'] = glm_model.bic
    model['tvalues'] = glm_model.tvalues
    model['pvalues'] = glm_model.pvalues
    model['fit_intercept'] = fit_intercept
    model['glm_model'] = glm_model
    model['report'] = rb.get()
    return {'model': model}
def _xgb_classification_train(table, feature_cols, label_col, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='binary:logistic', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None, sample_weight_eval_set=None):
    """Train an XGBoost classifier on the selected columns.

    Returns a dict with ``model`` holding the fitted classifier, its
    parameters, feature importances, and a report with an importance plot.
    All constructor/fit parameters mirror ``xgboost.XGBClassifier`` /
    ``XGBClassifier.fit``.
    """
    validate(greater_than_or_equal_to(max_depth, 1, 'max_depth'),
             greater_than_or_equal_to(learning_rate, 0.0, 'learning_rate'),
             greater_than_or_equal_to(n_estimators, 1, 'n_estimators'))
    # Keyword arguments throughout: the previous purely positional calls
    # depended on the exact parameter order of XGBClassifier, which has
    # changed between xgboost versions.
    classifier = XGBClassifier(max_depth=max_depth,
                               learning_rate=learning_rate,
                               n_estimators=n_estimators,
                               silent=silent,
                               objective=objective,
                               booster=booster,
                               n_jobs=n_jobs,
                               nthread=nthread,
                               gamma=gamma,
                               min_child_weight=min_child_weight,
                               max_delta_step=max_delta_step,
                               subsample=subsample,
                               colsample_bytree=colsample_bytree,
                               colsample_bylevel=colsample_bylevel,
                               reg_alpha=reg_alpha,
                               reg_lambda=reg_lambda,
                               scale_pos_weight=scale_pos_weight,
                               base_score=base_score,
                               random_state=random_state,
                               seed=seed,
                               missing=missing)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight=sample_weight,
                   eval_set=eval_set,
                   eval_metric=eval_metric,
                   early_stopping_rounds=early_stopping_rounds,
                   verbose=verbose,
                   xgb_model=xgb_model,
                   sample_weight_eval_set=sample_weight_eval_set)
    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_
    # Importance plot for the report.
    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier
    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Importance
    | {fig_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _pca(table, input_cols, new_column_name='projected_', n_components=None, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None, hue=None, alpha=0, key_col=None):
    """Principal component analysis of the selected columns.

    Fits a full PCA (all components) and appends the first ``n_components``
    projected columns (named ``new_column_name + i``) to a copy of the input
    table. Returns a dict with ``out_table`` and ``model`` (components,
    variances, covariance/precision, biplot/scree report, fitted model).

    ``hue``, ``alpha`` and ``key_col`` only affect the scatter/biplot
    visualization embedded in the report.
    """
    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols
    validate(greater_than_or_equal_to(n_components, 1, 'n_components'))
    # Deliberately fit with n_components=None (all components) so the model
    # reports full variance/covariance; outputs are sliced to n_components
    # below. Keyword args: the previous positional call depended on sklearn's
    # exact parameter order, which has changed between versions.
    pca = PCA(n_components=None, copy=copy, whiten=whiten,
              svd_solver=svd_solver, tol=tol, iterated_power=iterated_power,
              random_state=random_state)
    pca_model = pca.fit(table[input_cols])
    column_names = [new_column_name + str(i) for i in range(n_components)]
    pca_result = pca_model.transform(table[input_cols])
    # Flat column list (the old code nested it in another list, creating a
    # throwaway MultiIndex before the columns were reassigned below).
    out_df = pd.DataFrame(data=pca_result[:, :n_components],
                          columns=column_names)
    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names
    res_components = pca_model.components_
    # columns=input_cols (flat): the old columns=[input_cols] produced a
    # MultiIndex that was never corrected and rendered wrong in the report.
    res_components_df = pd.DataFrame(data=res_components[:n_components],
                                     columns=input_cols)
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_
    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()
    # Visualization: plain scatter for a single component, biplot otherwise.
    plt.figure()
    if n_components == 1:
        sns.scatterplot(column_names[0], column_names[0], hue=hue, data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(0, 1,
                          pc_columns=column_names,
                          columns=input_cols,
                          singular_values=res_singular_values,
                          components=res_components,
                          explained_variance_ratio=res_explained_variance_ratio,
                          alpha=alpha,
                          hue=hue,
                          data=out_df,
                          ax=plt.gca(),
                          key_col=key_col)
    plt.figure()
    fig_scree = _screeplot(res_explained_variance,
                           res_explained_variance_ratio, n_components)
    table_explained_variance = pd.DataFrame(res_explained_variance,
                                            columns=['explained_variance'])
    table_explained_variance[
        'explained_variance_ratio'] = res_explained_variance_ratio
    table_explained_variance[
        'cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum()
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## PCA Result
    | ### Plot
    | {image1}
    |
    | ### Explained Variance
    | {fig_scree}
    | {table_explained_variance}
    |
    | ### Components
    | {table2}
    |
    | ### Parameters
    | {parameter1}
    """.format(image1=plt_two,
               fig_scree=fig_scree,
               table_explained_variance=pandasDF2MD(table_explained_variance),
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df))))
    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['_repr_brtc_'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols
    return {'out_table': out_df, 'model': model}
def _doc2vec(table, input_col, dm=1, vector_size=100, window=10, min_count=1, max_vocab_size=None, train_epoch=100, workers=1, alpha=0.025, min_alpha=0.025, seed=None, hs=1, negative=5, ns_exponent=0.75, topn=30, hashfxn=hash):
    """Train a gensim Doc2Vec model on the documents in ``input_col``.

    Returns a dict with ``model`` (trained Doc2Vec, parameters, t-SNE report
    of the top-``topn`` words), ``doc_table`` (input table plus inferred
    per-document vectors) and ``word_table`` (vocabulary words and vectors).
    """
    # Keep the user-supplied seed for reporting; when absent, draw a random
    # one so gensim still gets a concrete seed (the report shows None).
    if seed is None:
        random_state = seed
        seed = randint(0, 0xffffffff)
    else:
        random_state = seed
    docs = table[input_col]
    # Tag each document with its row index, as Doc2Vec requires.
    tagged_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]
    # hs = 1 if hs is True else 0
    if isinstance(dm, str):
        dm = int(dm)
    # dm=1 -> distributed memory, dm=0 -> distributed bag-of-words.
    algo = {1: 'PV-DM', 0: 'PV_DBOW'}[dm]
    d2v = Doc2Vec(documents=tagged_docs, dm=dm, vector_size=vector_size,
                  window=window, alpha=alpha, min_alpha=min_alpha, seed=seed,
                  min_count=min_count, max_vocab_size=max_vocab_size,
                  workers=workers, epochs=train_epoch, hs=hs,
                  negative=negative, ns_exponent=ns_exponent, hashfxn=hashfxn)
    vocab = d2v.wv.vocab
    params = {
        'Input column': input_col,
        'Training algorithm': algo,
        'Dimension of Vectors': vector_size,
        'Window': window,
        'Minimum count': min_count,
        'Max vocabulary size': max_vocab_size,
        'Train epoch': train_epoch,
        'Number of workers': workers,
        'Alpha': alpha,
        'Minimum alpha': min_alpha,
        'Seed': random_state,
        'Hierarchical softmax': hs,
        'Negative': negative,
        'Negative sampling exponent': ns_exponent
    }
    # t-SNE visualization of the top-`topn` vocabulary entries.
    length = len(vocab)
    if length < topn:
        topn = length
    # NOTE(review): sorts by the gensim Vocab objects themselves — presumably
    # ordering by word frequency; verify against the gensim version in use.
    topn_words = sorted(vocab, key=vocab.get, reverse=True)[:topn]
    X = d2v[topn_words]
    # NOTE(review): n_components=min(2, topn) guards a tiny vocabulary, but
    # the DataFrame below always expects two columns ('x', 'y') — this path
    # looks broken for topn == 1; confirm upstream guarantees topn >= 2.
    tsne = TSNE(n_components=min(2, topn), random_state=seed)
    X_tsne = tsne.fit_transform(X)
    df = pd.DataFrame(X_tsne, index=topn_words, columns=['x', 'y'])
    fig = plt.figure()
    fig.set_size_inches(50, 40)
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(df['x'], df['y'], s=1000)
    ax.tick_params(axis='both', which='major', labelsize=50)
    for word, pos in df.iterrows():
        ax.annotate(word, pos, fontsize=80)
    plt.show()
    fig = plt2MD(plt)
    plt.clf()
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Doc2Vec Result
    |
    | ### Total Number of words
    | {length}
    |
    | ### Top {topn} Words
    | {topn_words}
    | {fig}
    |
    | ### Parameters
    """.format(length=length, topn=topn, topn_words=topn_words, fig=fig,
               params=dict2MD(params)) + """
    | {params}
    """.format(params=dict2MD(params))))
    model = _model_dict('doc2vec_model')
    model['params'] = params
    model['d2v'] = d2v
    model['_repr_brtc_'] = rb.get()
    # NOTE(review): with the default hashfxn=hash this value differs between
    # interpreter runs unless PYTHONHASHSEED is fixed — confirm intended.
    model['hash_val(Brightics)'] = hashfxn('Brightics')
    out_table1 = table.copy()
    # Inferred (not training-time) vector for each document.
    out_table1['document_vectors'] = [
        d2v.infer_vector(doc.words).tolist() for doc in tagged_docs
    ]
    out_table2 = pd.DataFrame({
        'words': d2v.wv.index2word,
        'word_vectors': d2v.wv[vocab].tolist()
    })
    return {'model': model, 'doc_table': out_table1, 'word_table': out_table2}
def _logistic_regression_train(table, feature_cols, label_col, penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):
    """Train a scikit-learn logistic regression model.

    Returns a dict with ``model`` containing the fitted estimator, its
    coefficients/intercept/classes, and a coefficient summary report.

    Raises (via ``raise_error``) when the label is continuous, since logistic
    regression requires a categorical target.
    """
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]
    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')
    # Keyword arguments: the previous positional call depended on sklearn's
    # exact parameter order, which has changed between versions.
    lr_model = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state, solver=solver,
                                  max_iter=max_iter, multi_class=multi_class,
                                  verbose=verbose, warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)
    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2
    # Build the coefficient matrix, prepending the intercept row when fitted.
    # (Debug print statements removed; duplicated concat branches collapsed.)
    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_names})
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)),
                                    axis=0)
    else:
        summary = pd.DataFrame({'features': feature_names})
        coef_trans = np.transpose(coefficients)
    # A binary fit has a single coefficient column, labeled with the first
    # class; a multiclass fit has one column per class.
    coef_columns = [classes[0]] if is_binary else classes
    summary = pd.concat((summary, pd.DataFrame(coef_trans,
                                               columns=coef_columns)), axis=1)
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    """.format(table1=pandasDF2MD(summary))))
    model = _model_dict('logistic_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()
    model['summary'] = summary
    return {'model': model}
def _linear_regression_train(table, feature_cols, label_col, fit_intercept=True):
    """Train an OLS linear regression model.

    Fits both a scikit-learn ``LinearRegression`` (for prediction) and a
    statsmodels OLS (for the statistical summary), and builds a report with
    predicted-vs-actual and residual diagnostic plots.

    Returns a dict with ``model`` containing coefficients, fit statistics
    (R^2, AIC/BIC, t/p-values), the sklearn estimator, and the report.
    """
    features = table[feature_cols]
    label = table[label_col]
    lr_model = LinearRegression(fit_intercept=fit_intercept)
    lr_model.fit(features, label)
    predict = lr_model.predict(features)
    residual = label - predict
    # statsmodels OLS is refit purely for its summary/statistics output.
    if fit_intercept:
        lr_model_fit = sm.OLS(label, sm.add_constant(features)).fit()
    else:
        lr_model_fit = sm.OLS(label, features).fit()
    summary = lr_model_fit.summary().as_html()
    # Predicted vs actual scatter with a least-squares overlay line.
    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    y = np.array(label)
    # Overlay line y = intercept + slope * x. np.polyfit replaces the old
    # hand-rolled normal equations, whose sum-of-x*y loop ran only to
    # x.size - 1 (off-by-one) and therefore drew a slightly wrong line.
    slope, intercept = np.polyfit(x, y, 1)
    p1x = np.min(x)
    p1y = intercept + slope * p1x
    p2x = np.max(x)
    p2y = intercept + slope * p2x
    plt.plot([p1x, p2x], [p1y, p2y], 'r--')
    fig_actual_predict = plt2MD(plt)
    # Residuals vs predicted.
    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    # Q-Q plot of the residuals.
    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    # Residual distribution.
    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(summary)
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3)))
    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['lr_model'] = lr_model
    model['report'] = rb.get()
    return {'model': model}