def _knn_classification(train_table,
                        test_table,
                        feature_cols,
                        label_col,
                        k=5,
                        algorithm='auto',
                        leaf_size=30,
                        p=2,
                        pred_col_name='prediction',
                        prob_col_prefix='probability',
                        suffix='index'):
    """Fit a k-nearest-neighbours classifier and label ``test_table``.

    The classifier is trained on ``train_table[feature_cols]`` against
    ``train_table[label_col]`` and then applied to ``test_table``.

    Returns a dict ``{'out_table': DataFrame}``.  The output holds the
    test table (index reset), one prediction column named
    ``pred_col_name`` and one probability column per class.  Probability
    columns are named ``<prob_col_prefix>_<n>`` where ``n`` is the class
    position when ``suffix == 'index'`` and the class label otherwise.
    For an empty test table only the column header is returned.

    Error ``'0718'`` is reported through ``raise_error`` when the label
    column is detected as a continuous target.
    """
    _, train_features = check_col_type(train_table, feature_cols)
    train_labels = train_table[label_col]
    _, test_features = check_col_type(test_table, feature_cols)

    if sklearn_utils.multiclass.type_of_target(train_labels) == 'continuous':
        raise_error('0718', 'label_col')

    classifier = KNeighborsClassifier(n_neighbors=k,
                                      algorithm=algorithm,
                                      leaf_size=leaf_size,
                                      p=p)
    classifier.fit(train_features, train_labels)
    class_labels = classifier.classes_

    # One tag per class: either its position or the label itself.
    if suffix == 'index':
        column_tags = list(range(len(class_labels)))
    else:
        column_tags = class_labels

    # Nothing to predict: hand back an empty frame with the full header.
    if test_table.shape[0] == 0:
        header = test_table.columns.tolist()
        header.append(pred_col_name)
        header.extend(
            prob_col_prefix + '_{}'.format(tag) for tag in column_tags)
        return {'out_table': pd.DataFrame(columns=header)}

    predicted = pd.DataFrame(classifier.predict(test_features),
                             columns=[pred_col_name])
    probabilities = pd.DataFrame(
        data=classifier.predict_proba(test_features),
        columns=['{}_{}'.format(prob_col_prefix, tag) for tag in column_tags])

    # The new frames carry a fresh RangeIndex, so the test table is
    # re-indexed the same way before the column-wise concatenation.
    combined = pd.concat(
        [test_table.reset_index(drop=True), predicted, probabilities],
        axis=1)
    return {'out_table': combined}
def _logistic_regression_predict(table,
                                 model,
                                 prediction_col='prediction',
                                 prob_prefix='probability',
                                 output_log_prob=False,
                                 log_prob_prefix='log_probability',
                                 thresholds=None,
                                 suffix='index'):
    """Predict classes and class probabilities with a trained model.

    Parameters
    ----------
    table : DataFrame holding the columns listed in ``model['features']``.
    model : dict providing ``'features'`` and a fitted ``'lr_model'``.
    prediction_col : name of the predicted-label column.
    prob_prefix, log_prob_prefix : prefixes of the probability and
        log-probability columns.
    output_log_prob : when true, log-probability columns are appended.
    thresholds : optional per-class thresholds.  ``None`` means equal
        thresholds.  A single-element list ``[t]`` with ``0 < t < 1`` is
        expanded to ``[t, 1 - t]`` for a two-class model.
    suffix : ``'index'`` names columns by class position, any other
        value names them by class label.

    Returns
    -------
    dict ``{'out_table': DataFrame}`` - a copy of ``table`` with the
    prediction and probability columns appended.

    Error ``'0613'`` is reported through ``raise_error`` when the number
    of thresholds differs from the number of classes.
    """
    feature_cols = model['features']
    _, features = check_col_type(table, feature_cols)

    lr_model = model['lr_model']
    classes = lr_model.classes_
    len_classes = len(classes)
    is_binary = len_classes == 2

    if thresholds is None:
        thresholds = np.array([1 / len_classes for _ in classes])
    elif isinstance(thresholds, list):
        if len(thresholds) == 1 and is_binary and 0 < thresholds[0] < 1:
            thresholds = np.array([thresholds[0], 1 - thresholds[0]])
        else:
            thresholds = np.array(thresholds)

    len_thresholds = len(thresholds)
    if (len_classes > 0 and len_thresholds > 0
            and len_classes != len_thresholds):
        # FN-0613='%s' must have length equal to the number of classes.
        raise_error('0613', ['thresholds'])

    prob = lr_model.predict_proba(features)

    # Vectorised choice of the class with the largest probability to
    # threshold ratio.  The result is a plain array, so it is assigned
    # positionally and cannot be misaligned by the table's index.
    prediction = classes[np.argmax(prob / thresholds, axis=1)]

    out_table = table.copy()
    out_table[prediction_col] = prediction

    if suffix == 'index':
        suffixes = list(range(len_classes))
    else:
        suffixes = classes

    # The new frames reuse the table's index; with a default RangeIndex
    # pd.concat(axis=1) would align on labels and pad rows with NaN for
    # any table whose index is not 0..n-1.
    frames = [
        out_table,
        pd.DataFrame(
            data=prob,
            columns=['{}_{}'.format(prob_prefix, tag) for tag in suffixes],
            index=out_table.index),
    ]
    if output_log_prob:
        frames.append(
            pd.DataFrame(
                data=lr_model.predict_log_proba(features),
                columns=[
                    '{}_{}'.format(log_prob_prefix, tag) for tag in suffixes
                ],
                index=out_table.index))

    return {'out_table': pd.concat(frames, axis=1)}
def _logistic_regression_train(table,
                               feature_cols,
                               label_col,
                               penalty='l2',
                               dual=False,
                               tol=0.0001,
                               C=1.0,
                               fit_intercept=True,
                               intercept_scaling=1,
                               class_weight=None,
                               random_state=None,
                               solver='liblinear',
                               max_iter=100,
                               multi_class='ovr',
                               verbose=0,
                               warm_start=False,
                               n_jobs=1):
    """Train a logistic regression model and build its summary report.

    ``table[feature_cols]`` is fitted against ``table[label_col]``.  The
    hyper-parameters are forwarded unchanged to ``LogisticRegression``.

    Returns a dict ``{'model': model}``.  The model dict stores the
    feature and label names, intercept, coefficients, classes, penalty,
    solver, the fitted estimator (``'lr_model'``) and the rendered
    report (``'_repr_brtc_'``).

    Error ``'0718'`` is reported through ``raise_error`` when the label
    column is detected as a continuous target.
    """
    features = table[feature_cols]
    label = table[label_col]

    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    # Keyword arguments keep the call independent of the constructor's
    # positional order.
    lr_model = LogisticRegression(penalty=penalty,
                                  dual=dual,
                                  tol=tol,
                                  C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state,
                                  solver=solver,
                                  max_iter=max_iter,
                                  multi_class=multi_class,
                                  verbose=verbose,
                                  warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2

    # One summary row per feature, preceded by the intercept if fitted.
    if fit_intercept:
        row_names = ['intercept'] + feature_cols
        coef_trans = np.concatenate(
            ([intercept], np.transpose(coefficients)), axis=0)
    else:
        row_names = feature_cols
        coef_trans = np.transpose(coefficients)

    # A two-class model has a single coefficient vector; it is shown
    # under the first class label, as before.
    coef_columns = [classes[0]] if is_binary else classes
    summary = pd.concat(
        (pd.DataFrame({'features': row_names}),
         pd.DataFrame(coef_trans, columns=coef_columns)),
        axis=1)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
        | ## Logistic Regression Result
        | ### Summary
        | {table1}
        """.format(table1=pandasDF2MD(summary))))

    model = _model_dict('logistic_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _under_sampling(table,
                    label_col,
                    sampling_strategy='not majority',
                    seed=None,
                    estimator='KMeans',
                    n_clusters=8,
                    voting='auto',
                    n_jobs=1):
    """Under-sample ``table`` with ``ClusterCentroids``.

    Every column except ``label_col`` is used as a feature.  Columns of
    dtype ``object`` (and the label, when it is of dtype ``object``) are
    label-encoded before resampling and decoded afterwards.

    Returns a dict ``{'out_table': DataFrame}`` holding the resampled
    features joined with the resampled label column.

    Error ``'0718'`` is reported through ``raise_error`` when the label
    column is detected as a continuous target.
    """
    # Separate features and label.  ``drop`` returns a new frame, so the
    # encoding below does not touch the caller's table.
    features = table.drop([label_col], axis=1)
    y = table[label_col]

    if sklearn_utils.multiclass.type_of_target(y) == 'continuous':
        raise_error('0718', 'label_col')

    # Each categorical column gets its OWN encoder.  A single shared
    # encoder is refitted on every call and would only remember the
    # last mapping, so earlier columns could not be decoded correctly.
    categorical_cols = [
        col for col in features.columns if features[col].dtypes == 'object'
    ]
    feature_encoders = {}
    for cate_col in categorical_cols:
        encoder = preprocessing.LabelEncoder()
        features[cate_col] = encoder.fit_transform(features[cate_col])
        feature_encoders[cate_col] = encoder

    # Encode the label only when it is of object dtype.
    label_is_object = y.dtypes == 'object'
    if label_is_object:
        label_encoder = preprocessing.LabelEncoder()
        y_encoder = label_encoder.fit_transform(y)
    else:
        y_encoder = y

    # Case-insensitive match: the default value is 'KMeans', which the
    # former comparison against 'Kmeans' never matched.  The estimator
    # is seeded so that ``seed`` keeps the run reproducible.
    if str(estimator).lower() == 'kmeans':
        estimator_model = KMeans(n_clusters=n_clusters, random_state=seed)
    else:
        estimator_model = None

    sm = ClusterCentroids(sampling_strategy=sampling_strategy,
                          random_state=seed,
                          estimator=estimator_model,
                          voting=voting,
                          n_jobs=n_jobs)
    X_res, y_res = sm.fit_resample(features, y_encoder)

    # Restore the original label values.
    if label_is_object:
        y_decoder = label_encoder.inverse_transform(y_res)
    else:
        y_decoder = y_res

    df = pd.DataFrame(data=X_res, columns=features.columns)
    for cate_col in categorical_cols:
        # Resampled codes are cast to int32 before they are mapped back
        # to category values with the encoder fitted on that column.
        df[cate_col] = feature_encoders[cate_col].inverse_transform(
            df[cate_col].astype('int32'))
    df1 = pd.DataFrame(data=y_decoder, columns=[label_col])

    out_table = df.join(df1)
    return {'out_table': out_table}
def check_col_type(table, feature_cols):
    """Validate the feature columns and return ``(names, data)``.

    * When ``check_list`` accepts the selected columns, the first
      feature column is converted to a Python list and the names are
      generated as ``<first column>_<i>``, one per element of the first
      row.
    * When ``check_all_numbers`` accepts them, the column names and the
      selected sub-frame are returned unchanged.
    * Otherwise error ``'0720'`` is reported through ``raise_error``.
    """
    selected = table[feature_cols]

    if check_list(selected):
        base_name = feature_cols[0]
        rows = table[base_name].tolist()
        # The width of the first row decides how many names are made.
        width = len(rows[0])
        expanded_names = [
            base_name + '_{}'.format(position) for position in range(width)
        ]
        return expanded_names, rows

    if check_all_numbers(selected):
        return feature_cols, selected

    raise_error('0720', 'feature_cols')
def _SMOTE(table,
           label_col,
           sampling_strategy='not majority',
           seed=None,
           k_neighbors=5,
           m_neighbors=10,
           out_step=0.5,
           kind='regular',
           svm_estimator='svc',
           n_jobs=1):
    """Over-sample ``table`` with SMOTE.

    Every column except ``label_col`` is treated as a feature.  The
    label is label-encoded before resampling and decoded afterwards.
    An ``svm.SVC`` instance is handed to the sampler only when
    ``kind == 'svm'``; the ``svm_estimator`` argument itself is not read
    by this function.

    Returns a dict ``{'out_table': DataFrame}`` with the resampled
    features joined with the resampled label column.

    Error ``'0718'`` is reported through ``raise_error`` when the label
    column is detected as a continuous target.
    """
    predictors = table.drop([label_col], axis=1)
    target = table[label_col]

    if sklearn_utils.multiclass.type_of_target(target) == 'continuous':
        raise_error('0718', 'label_col')

    encoder = preprocessing.LabelEncoder()
    encoded_target = encoder.fit_transform(target)

    support_vector_model = svm.SVC() if kind == 'svm' else None

    sampler = SMOTE_LIB(sampling_strategy=sampling_strategy,
                        random_state=seed,
                        k_neighbors=k_neighbors,
                        m_neighbors=m_neighbors,
                        out_step=out_step,
                        kind=kind,
                        svm_estimator=support_vector_model,
                        n_jobs=n_jobs)
    sampled_x, sampled_y = sampler.fit_resample(predictors, encoded_target)

    # Map the encoded labels back to their original values.
    restored_labels = encoder.inverse_transform(sampled_y)

    feature_frame = pd.DataFrame(data=sampled_x, columns=predictors.columns)
    label_frame = pd.DataFrame(data=restored_labels, columns=[label_col])

    return {'out_table': feature_frame.join(label_frame)}
def _decision_tree_classification_train(table,
                                        feature_cols,
                                        label_col,
                                        criterion='gini',
                                        splitter='best',
                                        max_depth=None,
                                        min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.0,
                                        max_features=None,
                                        random_state=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        class_weight=None,
                                        presort=False,
                                        sample_weight=None,
                                        check_input=True,
                                        X_idx_sorted=None):
    """Train a decision tree classifier and build its report.

    ``table[feature_cols]`` is fitted against ``table[label_col]``.
    Constructor arguments are forwarded to ``DecisionTreeClassifier``;
    ``sample_weight``, ``check_input`` and ``X_idx_sorted`` are
    forwarded to ``fit``.

    Returns a dict ``{'model': model}``.  The model dict stores the
    fitted classifier, several of its fitted attributes, its parameters
    and the rendered report (``'_repr_brtc_'``).  The report contains a
    tree picture (or a fallback message when the picture cannot be
    produced), a feature-importance bar chart and the parameter list.

    Error ``'0718'`` is reported through ``raise_error`` when the label
    column is detected as a continuous target.
    """
    y_train = table[label_col]

    if sklearn_utils.multiclass.type_of_target(y_train) == 'continuous':
        raise_error('0718', 'label_col')

    # Keyword arguments keep both calls independent of positional order.
    classifier = DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        class_weight=class_weight,
        presort=presort)
    classifier.fit(table[feature_cols],
                   y_train,
                   sample_weight=sample_weight,
                   check_input=check_input,
                   X_idx_sorted=X_idx_sorted)

    # Drawing the tree is best effort: it needs pydotplus and a Graphviz
    # installation.  Only ordinary exceptions are caught so that
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        from io import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(
            classifier,
            out_file=dot_data,
            feature_names=feature_cols,
            # export_graphviz expects the names in ascending class
            # order, i.e. the order of ``classes_``, not the order in
            # which the labels first appear in the table.
            class_names=[str(label) for label in classifier.classes_],
            filled=True,
            rounded=True,
            special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report: horizontal bar chart, least important feature at the bottom
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]
    sorted_importance = feature_importance[indices]
    positions = range(len(indices))

    plt.title('Feature Importances')
    plt.barh(positions, sorted_importance, color='b', align='center')
    for i, v in enumerate(sorted_importance):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(positions, sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    # Add tree plot
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
        | ## Decision Tree Classification Train Result
        | ### Decision Tree
        | {fig_tree}
        |
        | ### Feature Importance
        | {fig_feature_importances}
        |
        | ### Parameters
        | {list_parameters}
        |
        """.format(fig_tree=fig_tree,
                   fig_feature_importances=fig_feature_importances,
                   list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def raise_error(error_code, error_message_params, true_condition=False):
    """Forward an error request to ``common_validation.raise_error``.

    All three arguments are passed through unchanged and in the same
    order.  What the codes, the message parameters and
    ``true_condition`` mean is decided by ``common_validation``, which
    is not visible from this function.
    """
    common_validation.raise_error(error_code, error_message_params, true_condition)
def _logistic_regression_train(table,
                               feature_cols,
                               label_col,
                               penalty='l2',
                               dual=False,
                               tol=0.0001,
                               C=1.0,
                               fit_intercept=True,
                               intercept_scaling=1,
                               class_weight=None,
                               random_state=None,
                               solver='liblinear',
                               max_iter=100,
                               multi_class='ovr',
                               verbose=0,
                               warm_start=False,
                               n_jobs=1):
    """Train a logistic regression model with fit statistics.

    ``feature_cols`` are validated with ``check_col_type`` and fitted
    against ``table[label_col]``.  Besides the estimator, the function
    computes AIC, BIC and per-coefficient standard errors; for a
    two-class model it also computes Wald statistics and p-values.

    Returns a dict ``{'model': model}``.  The model dict stores the
    statistics, feature and label names, intercept, coefficients,
    classes, penalty, solver, the fitted estimator (``'lr_model'``), the
    summary table (``'summary'``) and the rendered report
    (``'_repr_brtc_'``).

    Error ``'0718'`` is reported through ``raise_error`` when the label
    column is detected as a continuous target.
    """
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]

    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    lr_model = LogisticRegression(penalty=penalty,
                                  dual=dual,
                                  tol=tol,
                                  C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state,
                                  solver=solver,
                                  max_iter=max_iter,
                                  multi_class=multi_class,
                                  verbose=verbose,
                                  warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2

    # Log-likelihood of the training data.  The logs are summed instead
    # of multiplying the raw probabilities: the product underflows to 0
    # on larger tables and would make AIC and BIC infinite.
    prob = lr_model.predict_proba(features)
    class_position = {cls: pos for pos, cls in enumerate(classes)}
    observed = np.array([class_position[value] for value in label],
                        dtype=int)
    log_likelihood = np.sum(np.log(prob[np.arange(len(observed)), observed]))

    # Number of fitted parameters: one per expanded feature, plus the
    # intercept when it is fitted.
    k = len(feature_names) + 1 if fit_intercept else len(feature_names)
    aic = 2 * k - 2 * log_likelihood
    bic = np.log(len(table)) * k - 2 * log_likelihood

    # Design matrix.  np.asarray accepts both the DataFrame and the
    # list-of-rows form that check_col_type can return.
    feature_matrix = np.asarray(features)
    if fit_intercept:
        x_design = np.hstack(
            [np.ones((feature_matrix.shape[0], 1)), feature_matrix])
    else:
        x_design = feature_matrix

    if is_binary:
        # Row weights p * (1 - p); the covariance is inv(X' W X).
        v = np.prod(prob, axis=1)
        cov_logit = np.linalg.inv(
            np.dot((x_design * v[:, np.newaxis]).T, x_design))
        std_err = np.sqrt(np.diag(cov_logit))
        if fit_intercept:
            logit_params = np.insert(coefficients, 0, intercept)
        else:
            # Flatten the (1, n_features) matrix so the statistics stay
            # one-dimensional, like in the intercept case.
            logit_params = np.ravel(coefficients)
        wald = (logit_params / std_err)**2
        # Survival function instead of 1 - cdf: same quantity, without
        # the cancellation error for very small p-values.
        p_values = chi2.sf(wald, 1)
    else:
        std_err = []
        for class_idx in range(len(classes)):
            class_prob = prob[:, class_idx]
            v = class_prob * (1 - class_prob)
            cov_logit = np.linalg.inv(
                np.dot((x_design * v[:, np.newaxis]).T, x_design))
            std_err.append(np.sqrt(np.diag(cov_logit)))
        std_err = np.array(std_err)

    # Summary table: one row per feature, preceded by the intercept.
    if fit_intercept:
        summary = pd.DataFrame(
            {'features': ['intercept'] + list(feature_names)})
        coef_trans = np.concatenate(
            ([intercept], np.transpose(coefficients)), axis=0)
    else:
        summary = pd.DataFrame({'features': feature_names})
        coef_trans = np.transpose(coefficients)

    if is_binary:
        summary = pd.concat(
            (summary,
             pd.DataFrame(coef_trans, columns=[classes[0]]),
             pd.DataFrame(std_err, columns=['standard_error']),
             pd.DataFrame(wald, columns=['wald_statistic']),
             pd.DataFrame(p_values, columns=['p_value'])),
            axis=1)
    else:
        error_columns = ['standard_error_{}'.format(cls) for cls in classes]
        summary = pd.concat(
            (summary,
             pd.DataFrame(coef_trans, columns=classes),
             pd.DataFrame(std_err.T, columns=error_columns)),
            axis=1)
        # Put each class's standard error right after its coefficients.
        arrange_col = ['features']
        for cls, error_column in zip(classes, error_columns):
            arrange_col.append(cls)
            arrange_col.append(error_column)
        summary = summary[arrange_col]

    rb = BrtcReprBuilder()
    if is_binary:
        rb.addMD(
            strip_margin("""
            | ## Logistic Regression Result
            | ### Summary
            | {table1}
            |
            | ##### Column '{small}' is the coefficients under the assumption ({small} = 0, {big} = 1).
            |
            | #### AIC : {aic}
            |
            | #### BIC : {bic}
            """.format(small=classes[0],
                       big=classes[1],
                       table1=pandasDF2MD(summary, num_rows=100),
                       aic=aic,
                       bic=bic)))
    else:
        rb.addMD(
            strip_margin("""
            | ## Logistic Regression Result
            | ### Summary
            | {table1}
            |
            | ##### Each column whose name is one of classes of Label Column is the coefficients under the assumption it is 1 and others are 0.
            |
            | ##### For example, column '{small}' is the coefficients under the assumption ({small} = 1, others = 0).
            |
            | #### AIC : {aic}
            |
            | #### BIC : {bic}
            """.format(small=classes[0],
                       table1=pandasDF2MD(summary, num_rows=100),
                       aic=aic,
                       bic=bic)))

    model = _model_dict('logistic_regression_model')
    model['standard_errors'] = std_err
    model['aic'] = aic
    model['bic'] = bic
    if is_binary:
        model['wald_statistics'] = wald
        model['p_values'] = p_values
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()
    model['summary'] = summary

    return {'model': model}
def _ftest_for_stacked_data(table,
                            response_cols,
                            factor_col,
                            alternatives,
                            first=None,
                            second=None,
                            confi_level=0.95):
    """Run F tests for equality of variances on stacked data.

    ``factor_col`` splits ``table`` into the groups ``first`` and
    ``second``.  For each column in ``response_cols`` the statistic is
    ``F = var(first) / var(second)`` and is evaluated for every entry of
    ``alternatives`` among ``'larger'``, ``'smaller'`` and
    ``'two-sided'``.

    When only one of ``first`` / ``second`` is given (or neither), the
    missing level is taken from the unique values of ``factor_col``,
    which must then number exactly two (error ``'0719'`` is reported
    through ``raise_error`` otherwise).

    Returns a dict with ``'out_table'`` (one row per response column and
    alternative) and ``'model'`` (holding the rendered report).
    """
    # Cast the given levels to the type of the factor values so that the
    # equality filters below can match.  Only the first non-None value
    # of the factor column decides the type; strings are left as is.
    if first is not None or second is not None:
        for element in np.array(table[factor_col]):
            if element is None:
                continue
            if not isinstance(element, str):
                caster = bool if isinstance(element, bool) else float
                if first is not None:
                    first = caster(first)
                if second is not None:
                    second = caster(second)
            break

    if first is None or second is None:
        tmp_factors = np.unique(table[factor_col])
        if len(tmp_factors) != 2:
            raise_error('0719', 'factor_col')
        if first is None:
            if tmp_factors[0] != second:
                first = tmp_factors[0]
            else:
                first = tmp_factors[1]
        if second is None:
            if tmp_factors[0] != first:
                second = tmp_factors[0]
            else:
                second = tmp_factors[1]

    table_first = table[table[factor_col] == first]
    table_second = table[table[factor_col] == second]

    # Degrees of freedom of the numerator and denominator variances.
    d_num = len(table_first[factor_col]) - 1
    d_denum = len(table_second[factor_col]) - 1

    f_dist = scipy.stats.f
    statistics_text = (
        'F statistic, F distribution with %d numerator degrees of freedom and %d degrees of freedom under the null hypothesis.'
        % (d_num, d_denum))

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
        ## F Test for Stacked Data Result
        | - Confidence level = {confi_level}
        | - Statistics = F statistic, F distribution with {d_num} numerator degrees of freedom and {d_denum} degrees of freedom under the null hypothesis
        """.format(confi_level=confi_level, d_num=d_num, d_denum=d_denum)))

    tmp_table = []
    for response_col in response_cols:
        tmp_model = []
        std1 = (table_first[response_col]).std()
        std2 = (table_second[response_col]).std()
        f_value = (std1**2) / (std2**2)
        data_text = '%s by %s(%s,%s)' % (response_col, factor_col, first,
                                         second)

        if 'larger' in alternatives:
            # H1: ratio > 1, so p = P(F >= f) under F(d_num, d_denum).
            # The earlier form cdf(1 / f, d_num, d_denum) swapped the
            # degrees of freedom and was only right for equal-sized
            # groups.
            p_value = f_dist.sf(f_value, d_num, d_denum)
            lower = f_value / f_dist.ppf(confi_level, d_num, d_denum)
            tmp_model.append(['true ratio > 1', p_value, (lower, math.inf)])
            tmp_table.append([
                data_text, 'true ratio of variances > 1', statistics_text,
                f_value, p_value, confi_level, lower, math.inf
            ])

        if 'smaller' in alternatives:
            # H1: ratio < 1, so p = P(F <= f).
            p_value = f_dist.cdf(f_value, d_num, d_denum)
            upper = f_value * f_dist.ppf(confi_level, d_denum, d_num)
            tmp_model.append(['true ratio < 1', p_value, (0.0, upper)])
            tmp_table.append([
                data_text, 'true ratio of variances < 1', statistics_text,
                f_value, p_value, confi_level, 0.0, upper
            ])

        if 'two-sided' in alternatives:
            # Twice the smaller tail of F(d_num, d_denum) at f.
            p_value = 2 * min(f_dist.cdf(f_value, d_num, d_denum),
                              f_dist.sf(f_value, d_num, d_denum))
            tail_level = (1 + confi_level) / 2
            lower = f_value / f_dist.ppf(tail_level, d_num, d_denum)
            upper = f_value * f_dist.ppf(tail_level, d_denum, d_num)
            tmp_model.append(['true ratio != 1', p_value, (lower, upper)])
            tmp_table.append([
                data_text, 'true ratio of variances != 1', statistics_text,
                f_value, p_value, confi_level, lower, upper
            ])

        result_model = pd.DataFrame.from_records(tmp_model)
        result_model.columns = [
            'alternative_hypothesis', 'p-value',
            '%g%% confidence interval' % (confi_level * 100)
        ]
        rb.addMD(
            strip_margin("""
            | #### Data = {response_col} by {factor_col}({first},{second})
            | - F-value = {f_value}
            |
            | {result_model}
            |
            """.format(response_col=response_col,
                       factor_col=factor_col,
                       first=first,
                       second=second,
                       f_value=f_value,
                       result_model=pandasDF2MD(result_model))))

    result = pd.DataFrame.from_records(tmp_table)
    result.columns = [
        'data', 'alternative_hypothesis', 'statistics', 'estimates',
        'p_value', 'confidence_level', 'lower_confidence_interval',
        'upper_confidence_interval'
    ]

    model = dict()
    model['_repr_brtc_'] = rb.get()
    return {'out_table': result, 'model': model}
def _mlp_classification_train(table,
                              feature_cols,
                              label_col,
                              hidden_layer_sizes=(100, ),
                              activation='relu',
                              solver='adam',
                              alpha=0.0001,
                              batch_size_auto=True,
                              batch_size='auto',
                              learning_rate='constant',
                              learning_rate_init=0.001,
                              max_iter=200,
                              random_state=None,
                              tol=0.0001):
    """Train a multi-layer perceptron classifier and build its report.

    ``feature_cols`` are validated with ``check_col_type`` and fitted
    against ``table[label_col]``.  Accuracy and micro-averaged F1,
    precision and recall are computed on the training data itself.

    ``batch_size_auto`` is accepted for interface compatibility but is
    not read by this function; ``batch_size`` is forwarded as given.

    Returns a dict ``{'model': model}``.  The model dict stores the
    fitted estimator (``'mlp_model'``), its intercepts, coefficients,
    classes and loss, the four scores, the hyper-parameters and the
    rendered report (``'_repr_brtc_'``).

    Error ``'0718'`` is reported through ``raise_error`` when the label
    column is detected as a continuous target.
    """
    _, features = check_col_type(table, feature_cols)
    label = table[label_col]

    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    mlp_model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,
                              activation=activation,
                              solver=solver,
                              alpha=alpha,
                              batch_size=batch_size,
                              learning_rate=learning_rate,
                              learning_rate_init=learning_rate_init,
                              max_iter=max_iter,
                              shuffle=True,
                              random_state=random_state,
                              tol=tol)
    mlp_model.fit(features, label)

    predict = mlp_model.predict(features)
    _accuracy_score = accuracy_score(label, predict)
    _f1_score = f1_score(label, predict, average='micro')
    _precision_score = precision_score(label, predict, average='micro')
    _recall_score = recall_score(label, predict, average='micro')

    # Built from dicts: DataFrame.from_items no longer exists in current
    # pandas.  ``columns`` pins the column order explicitly.
    result_table = pd.DataFrame(
        {
            'Metric': [
                'Accuracy Score', 'F1 Score', 'Precision Score',
                'Recall Score'
            ],
            'Score': [
                _accuracy_score, _f1_score, _precision_score, _recall_score
            ],
        },
        columns=['Metric', 'Score'])

    # Estimator parameter name -> label shown in the report.
    label_name = {
        'hidden_layer_sizes': 'Hidden Layer Sizes',
        'activation': 'Activation Function',
        'solver': 'Solver',
        'alpha': 'Alpha',
        'batch_size': 'Batch Size',
        'learning_rate': 'Learning Rate',
        'learning_rate_init': 'Learning Rate Initial',
        'max_iter': 'Max Iteration',
        'random_state': 'Seed',
        'tol': 'Tolerance'
    }
    get_param = mlp_model.get_params()
    param_table = pd.DataFrame(
        {
            'Parameter': list(label_name.values()),
            'Value': [get_param[name] for name in label_name],
        },
        columns=['Parameter', 'Value'])

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
        | ### MLP Classification Result
        | {result}
        | ### Parameters
        | {list_parameters}
        """.format(result=pandasDF2MD(result_table),
                   list_parameters=pandasDF2MD(param_table))))

    model = _model_dict('mlp_classification_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercepts'] = mlp_model.intercepts_
    model['coefficients'] = mlp_model.coefs_
    model['class'] = mlp_model.classes_
    model['loss'] = mlp_model.loss_
    model['accuracy_score'] = _accuracy_score
    model['f1_score'] = _f1_score
    model['precision_score'] = _precision_score
    model['recall_score'] = _recall_score
    model['activation'] = activation
    model['solver'] = solver
    model['alpha'] = alpha
    model['batch_size'] = batch_size
    model['learning_rate'] = learning_rate
    model['learning_rate_init'] = learning_rate_init
    model['max_iter'] = max_iter
    model['random_state'] = random_state
    model['tol'] = tol
    model['mlp_model'] = mlp_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _one_hot_encode_model_features(features, type_table, feature_table):
    """One-hot encode the string-typed inputs and select model columns.

    Rows of ``type_table`` (all but the last) whose ``data_type`` is
    ``'string'`` give the positions of the columns to encode; the result
    is then restricted to ``feature_table['features']``.
    """
    one_hot_input = type_table[:-1][type_table['data_type'][:-1] ==
                                    'string'].index
    if len(one_hot_input) != 0:
        features = one_hot_encoder(
            prefix='col_name',
            table=features,
            input_cols=features.columns[one_hot_input].tolist(),
            suffix='label')['out_table']
    return features[feature_table['features']]


def _cast_class_labels(labels, data_type):
    """Cast stored class labels according to their recorded data type."""
    if data_type == 'integer' or data_type == 'long':
        return np.array([int(i) for i in labels])
    if data_type == 'float' or data_type == 'double':
        return np.array([float(i) for i in labels])
    return labels


def _logistic_regression_predict(table,
                                 model,
                                 prediction_col='prediction',
                                 prob_prefix='probability',
                                 output_log_prob=False,
                                 log_prob_prefix='log_probability',
                                 thresholds=None,
                                 suffix='index'):
    """Predict classes and class probabilities from a model dict.

    Two kinds of model are handled:

    * a model holding a fitted estimator under ``'lr_model'``, whose
      ``predict_proba`` is used directly;
    * a model without ``'lr_model'``, whose classes, coefficients and
      intercept are read from its ``'table_N'`` entries (layout chosen
      by the ``'auto'`` flag and by whether ``model['_type']`` contains
      ``'vs'``) and whose probabilities are computed with the logistic
      function.  NOTE(review): the schema of those tables is defined by
      whatever produced the model and is not visible here.

    ``thresholds`` behaves as follows: ``None`` means equal thresholds;
    a single-element list ``[t]`` with ``0 < t < 1`` is expanded to
    ``[t, 1 - t]`` for a two-class model.  ``suffix == 'index'`` names
    the probability columns by class position, any other value by class
    label.

    Returns a dict ``{'out_table': DataFrame}`` - a copy of ``table``
    with the prediction, probability and (optionally) log-probability
    columns appended.  For an empty table only the header is returned.

    Error ``'0613'`` is reported through ``raise_error`` when the number
    of thresholds differs from the number of classes.
    """
    # Empty input: return the header only.  This path reads the classes
    # from model['lr_model'].
    if table.shape[0] == 0:
        new_cols = table.columns.tolist() + [prediction_col]
        classes = model['lr_model'].classes_
        if suffix == 'index':
            tags = list(range(len(classes)))
        else:
            tags = classes
        prob_cols = [prob_prefix + '_{}'.format(i) for i in tags]
        if output_log_prob:
            log_cols = [log_prob_prefix + '_{}'.format(i) for i in tags]
        else:
            log_cols = []
        new_cols += prob_cols + log_cols
        out_table = pd.DataFrame(columns=new_cols)
        return {'out_table': out_table}

    if 'features' in model:
        feature_cols = model['features']
    else:
        feature_cols = model['feature_cols']

    if 'lr_model' in model:
        feature_names, features = check_col_type(table, feature_cols)
        features = pd.DataFrame(features, columns=feature_names)
    else:
        features = table[feature_cols]

    # Models carrying an 'auto' flag may need their string inputs
    # one-hot encoded; which tables describe the inputs depends on the
    # flag and on the model type.
    if 'auto' in model:
        if 'vs' not in model['_type']:
            if model['auto']:
                type_key, feature_key = 'table_4', 'table_2'
            else:
                type_key, feature_key = 'table_3', 'table_1'
        else:
            if model['auto']:
                type_key, feature_key = 'table_3', 'table_2'
            else:
                type_key, feature_key = 'table_2', 'table_1'
        features = _one_hot_encode_model_features(features, model[type_key],
                                                  model[feature_key])

    # Resolve classes and, for estimator-free models, the parameters.
    if 'lr_model' in model:
        lr_model = model['lr_model']
        classes = lr_model.classes_
        len_classes = len(classes)
        is_binary = len_classes == 2
    else:
        fit_intercept = model['fit_intercept']
        if 'vs' not in model['_type']:
            len_classes = 2
            is_binary = True
            if 'auto' in model:
                if model['auto']:
                    label_table, param_table = 'table_4', 'table_3'
                else:
                    label_table, param_table = 'table_3', 'table_2'
                classes = _cast_class_labels(
                    model[label_table]['labels'].values[-1],
                    model[label_table]['data_type'].values[-1])
                coefficients = model[param_table]['coefficients'][0]
                intercept = model[param_table]['intercept'][0]
            else:
                classes = np.array([0, 1])
                coefficients = model['table_2']['coefficient'][1:]
                if fit_intercept:
                    intercept = model['table_2']['coefficient'][0]
        else:
            if 'auto' in model:
                if model['auto']:
                    label_table, param_table = 'table_3', 'table_2'
                else:
                    label_table, param_table = 'table_2', 'table_1'
                classes = np.array(model[label_table]['labels'].values[-1])
                len_classes = len(classes)
                is_binary = len_classes == 2
                intercept = model[param_table].intercept
                coefficients = model[param_table].coefficients
            else:
                classes = np.array(model['table_1'].labelInfo)
                len_classes = len(classes)
                is_binary = len_classes == 2
                intercept = model['table_1'].intercept
                coefficients = (model['table_1'][[
                    i for i in model['table_1'].columns if 'coefficient' in i
                ]]).values

    if thresholds is None:
        thresholds = np.array([1 / len_classes for _ in classes])
    elif isinstance(thresholds, list):
        if len(thresholds) == 1 and is_binary and 0 < thresholds[0] < 1:
            thresholds = np.array([thresholds[0], 1 - thresholds[0]])
        else:
            thresholds = np.array(thresholds)

    len_thresholds = len(thresholds)
    if (len_classes > 0 and len_thresholds > 0
            and len_classes != len_thresholds):
        # FN-0613='%s' must have length equal to the number of classes.
        raise_error('0613', ['thresholds'])

    if 'lr_model' in model:
        prob = lr_model.predict_proba(features)
    else:
        features = features.values
        coefficients = np.array(coefficients)
        if is_binary:
            tmp = features * coefficients
            # The first column is 1 / (1 + exp(z)), the second its
            # complement.
            if fit_intercept or 'auto' in model:
                prob = 1 / (np.exp(np.sum(tmp, axis=1) + intercept) + 1)
            else:
                prob = 1 / (np.exp(np.sum(tmp, axis=1)) + 1)
            prob = np.array([[x, 1 - x] for x in prob])
        else:
            prob = []
            for i in range(len(coefficients)):
                tmp = features * coefficients[i]
                if fit_intercept:
                    prob.append(
                        1 / (np.exp(-np.sum(tmp, axis=1) - intercept[i]) + 1))
                else:
                    prob.append(1 / (np.exp(-np.sum(tmp, axis=1)) + 1))
            prob = np.array(prob).T
            # Normalise the per-class scores so each row sums to one.
            prob = np.apply_along_axis(lambda x: x / np.sum(x), 1, prob)

    # Labels read from a model table may not be an array yet; fancy
    # indexing below needs one.
    classes = np.asarray(classes)
    prediction = classes[np.argmax(prob / thresholds, axis=1)]

    out_table = table.copy()
    out_table[prediction_col] = prediction

    if suffix == 'index':
        suffixes = [i for i, _ in enumerate(classes)]
    else:
        suffixes = classes

    # The new frames reuse the table's index; with a default RangeIndex
    # pd.concat(axis=1) would align on labels and pad rows with NaN for
    # any table whose index is not 0..n-1.
    frames = [
        out_table,
        pd.DataFrame(
            data=prob,
            columns=['{}_{}'.format(prob_prefix, tag) for tag in suffixes],
            index=out_table.index),
    ]
    if output_log_prob:
        frames.append(
            pd.DataFrame(
                data=np.log(prob),
                columns=[
                    '{}_{}'.format(log_prob_prefix, tag) for tag in suffixes
                ],
                index=out_table.index))

    return {'out_table': pd.concat(frames, axis=1)}
def _random_forest_classification_train(table,
                                        feature_cols,
                                        label_col,
                                        n_estimators=10,
                                        criterion="gini",
                                        max_depth=None,
                                        min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0,
                                        max_features="sqrt",
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0,
                                        class_weight=None,
                                        random_state=None):
    """Train a random forest classifier and build its report.

    ``feature_cols`` are validated with ``check_col_type`` and fitted
    against ``table[label_col]``.

    ``max_features == "n"`` is translated to ``None`` before it is
    handed to the estimator.  ``class_weight``, when given, is a
    sequence with one weight per class; it is matched to the sorted
    class labels and converted to a dict.

    Returns a dict ``{'model': model}``.  The model dict stores the
    fitted classifier, the effective parameters, the rendered report
    (``'_repr_brtc_'``) and a feature-importance table.

    Raises ``ValueError`` when the number of class weights differs from
    the number of labels.  Error ``'0718'`` is reported through
    ``raise_error`` when the label column is detected as a continuous
    target.
    """
    feature_names, features_train = check_col_type(table, feature_cols)
    y_train = table[label_col]

    if type_of_target(y_train) == 'continuous':
        raise_error('0718', 'label_col')

    if max_features == "n":
        max_features = None

    class_labels = y_train.unique()
    if class_weight is not None:
        if len(class_weight) != len(class_labels):
            raise ValueError(
                "Number of class weights should match number of labels.")
        # Weights are given in sorted-label order.
        class_weight = dict(zip(sorted(class_labels), class_weight))

    classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        class_weight=class_weight,
        random_state=random_state)
    classifier.fit(features_train, y_train)

    params = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'n_estimators': n_estimators,
        'criterion': criterion,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'min_weight_fraction_leaf': min_weight_fraction_leaf,
        'max_features': max_features,
        'max_leaf_nodes': max_leaf_nodes,
        'min_impurity_decrease': min_impurity_decrease,
        'class_weight': class_weight,
        'random_state': random_state
    }

    model = _model_dict('random_forest_classification_model')
    model['classifier'] = classifier
    model['params'] = params

    fig_feature_importances = _plot_feature_importances(
        feature_names, classifier)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
        | ## Random Forest Classification Train Result
        |
        | ### Parameters
        | {params}
        |
        | ### Feature Importance
        | {fig_feature_importances}
        |
        """.format(params=dict2MD(params),
                   fig_feature_importances=fig_feature_importances)))
    model['_repr_brtc_'] = rb.get()

    # One row per fitted feature.  ``feature_names`` (not
    # ``feature_cols``) is used because check_col_type may expand a
    # single array-typed column into several named features, and the
    # importances are reported per expanded feature.
    feature_importance = classifier.feature_importances_
    feature_importance_table = pd.DataFrame(
        list(zip(feature_names, feature_importance)),
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table

    return {'model': model}
def _ttest_coerce_factor_levels(factor_values, first, second):
    """Cast the requested factor levels to the type of the factor column.

    The first non-None entry of ``factor_values`` decides the cast: strings
    leave the levels untouched, Python bools cast with ``bool`` and anything
    else casts with ``float``. Levels that are ``None`` stay ``None``.
    """
    sample = next((value for value in factor_values if value is not None), None)
    # isinstance (rather than an exact type comparison) also recognises str
    # subclasses such as numpy.str_, which must not be sent through float().
    if sample is None or isinstance(sample, str):
        return first, second
    cast = bool if isinstance(sample, bool) else float
    if first is not None:
        first = cast(first)
    if second is not None:
        second = cast(second)
    return first, second


def _ttest_margin_of_error(quantile, df, usevar, n1, n2, std1, std2):
    """Return the confidence-interval half width for the difference in means."""
    if usevar == 'pooled':
        pooled_std = sqrt(((n1 - 1) * std1 ** 2 + (n2 - 1) * std2 ** 2) / (n1 + n2 - 2))
        return t.ppf(quantile, df) * pooled_std * sqrt(1 / n1 + 1 / n2)
    if usevar == 'unequal':
        return t.ppf(quantile, df) * sqrt(std1 ** 2 / n1 + std2 ** 2 / n2)
    raise ValueError("equal_vari must resolve to 'pooled' or 'unequal', got %r" % (usevar,))


def _two_sample_ttest_for_stacked_data(table, response_cols, factor_col, alternatives,
                                       first=None, second=None, hypo_diff=0,
                                       equal_vari='pooled', confi_level=0.95):
    """Run two-sample t tests on stacked data, one test set per response column.

    Parameters
    ----------
    table : pandas.DataFrame
        Stacked data: every row carries a response value and a factor level.
    response_cols : list
        Columns whose means are compared between the two groups.
    factor_col : str
        Column that splits the rows into the two groups.
    alternatives : container of str
        Any of ``'larger'``, ``'smaller'`` and ``'two-sided'``; results are
        always produced in that order.
    first, second : optional
        Factor levels of the first and second group. A missing level is
        filled in from the column, which then must contain exactly two
        distinct values (otherwise error code ``'0719'`` is raised).
    hypo_diff : number
        Hypothesised difference in means, passed to ``ttest_ind`` as ``value``.
    equal_vari : {'pooled', 'unequal', 'auto'}
        Variance assumption. ``'auto'`` picks ``'unequal'`` per response
        column when a two-sided F test on the sample variances has a
        p-value below 0.05, and ``'pooled'`` otherwise.
    confi_level : float
        Confidence level of the reported intervals.

    Returns
    -------
    dict
        ``{'out_table': DataFrame, 'model': {'_repr_brtc_': report}}``.
    """
    if first is not None or second is not None:
        first, second = _ttest_coerce_factor_levels(
            np.array(table[factor_col]), first, second)

    if first is None or second is None:
        levels = np.unique(table[factor_col])
        if len(levels) != 2:
            raise_error('0719', 'factor_col')
        if first is None:
            first = levels[0] if levels[0] != second else levels[1]
        if second is None:
            second = levels[0] if levels[0] != first else levels[1]

    table_first = table[table[factor_col] == first]
    table_second = table[table[factor_col] == second]

    rb = BrtcReprBuilder()
    # Every template line carries the `|` margin marker, like the other
    # report templates in this file.
    rb.addMD(strip_margin("""
    | ## Two Sample T Test for Stacked Data Result
    | - Hypothesized mean = {hypo_diff}
    | - Confidence level = {confi_level}
    """.format(hypo_diff=hypo_diff, confi_level=confi_level)))

    # (alternative, comparison symbol, t quantile used for the margin)
    alternative_specs = (
        ('larger', '>', confi_level),
        ('smaller', '<', confi_level),
        ('two-sided', '!=', (confi_level + 1) / 2),
    )

    summary_rows = []
    for response_col in response_cols:
        sample1 = table_first[response_col]
        sample2 = table_second[response_col]
        n1, n2 = len(sample1), len(sample2)
        mean1, mean2 = sample1.mean(), sample2.mean()
        std1, std2 = sample1.std(), sample2.std()

        # Resolve the variance assumption for this column only, leaving the
        # caller-supplied equal_vari untouched for the next column.
        usevar = equal_vari
        if equal_vari == 'auto':
            # s1^2 / s2^2 follows F(n1 - 1, n2 - 1) under equal variances, so
            # the CDF is evaluated at that ratio with matching degrees of
            # freedom; the two-sided p-value doubles the smaller tail.
            variance_ratio = (std1 ** 2) / (std2 ** 2)
            lower_tail = stats.f.cdf(variance_ratio, n1 - 1, n2 - 1)
            f_test_p_value = 2 * min(lower_tail, 1 - lower_tail)
            usevar = 'unequal' if f_test_p_value < 0.05 else 'pooled'

        # The t statistic and degrees of freedom do not depend on the
        # alternative; only the p-value does.
        reference = ttest_ind(sample1, sample2, 'larger', usevar=usevar, value=hypo_diff)
        t_stat, df = reference[0], reference[2]

        data_label = '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
        stat_description = ('t statistic, t distribution with %f degrees of freedom '
                            'under the null hypothesis' % df)

        model_rows = []
        for alternative, symbol, quantile in alternative_specs:
            if alternative not in alternatives:
                continue
            if alternative == 'larger':
                p_value = reference[1]
            else:
                p_value = ttest_ind(sample1, sample2, alternative,
                                    usevar=usevar, value=hypo_diff)[1]

            margin = _ttest_margin_of_error(quantile, df, usevar, n1, n2, std1, std2)
            # One-sided intervals are unbounded on the side of the alternative.
            lower = -math.inf if alternative == 'smaller' else mean1 - mean2 - margin
            upper = math.inf if alternative == 'larger' else mean1 - mean2 + margin

            hypothesis = 'true difference in means {} {}'.format(symbol, hypo_diff)
            model_rows.append([hypothesis, p_value, (lower, upper)])
            summary_rows.append([data_label, hypothesis, stat_description, t_stat,
                                 p_value, confi_level, lower, upper])

        result_model = pd.DataFrame.from_records(model_rows)
        result_model.columns = ['alternative hypothesis', 'p-value',
                                '%g%% confidence interval' % (confi_level * 100)]
        rb.addMD(strip_margin("""
        | #### Data = {response_col} by {factor_col}({first},{second})
        | - Statistics = t statistic, t distribution with {ttestresult2} degrees of freedom under the null hypothesis
        | - t-value = {ttestresult0}
        |
        | {result_model}
        |
        """.format(ttestresult2=df, response_col=response_col, factor_col=factor_col,
                   first=first, second=second, ttestresult0=t_stat,
                   result_model=pandasDF2MD(result_model))))

    result = pd.DataFrame.from_records(summary_rows)
    result.columns = ['data', 'alternative_hypothesis', 'statistics', 'estimates',
                      'p_value', 'confidence_level', 'lower_confidence_interval',
                      'upper_confidence_interval']

    model = dict()
    model['_repr_brtc_'] = rb.get()
    return {'out_table': result, 'model': model}
def _random_forest_classification_train(table, feature_cols, label_col,
                                        n_estimators=10, criterion="gini",
                                        max_depth=None, min_samples_split=2,
                                        min_samples_leaf=1, min_weight_fraction_leaf=0,
                                        max_features="sqrt", max_leaf_nodes=None,
                                        min_impurity_decrease=0, random_state=None):
    """Train a random forest classifier on ``table`` and return it as a model.

    The feature matrix is ``table[feature_cols]`` and the target is
    ``table[label_col]``. A target that
    ``sklearn_utils.multiclass.type_of_target`` reports as ``'continuous'``
    is rejected with error code ``'0718'``. The string ``"None"`` for
    ``max_features`` is translated to ``None``; all other hyper-parameters
    are forwarded unchanged to ``RandomForestClassifier``.

    Returns ``{'model': model}`` where ``model`` contains ``'classifier'``,
    ``'params'`` (the column names plus the hyper-parameters actually used)
    and ``'_repr_brtc_'`` (a markdown report with the feature-importance
    figure).
    """
    predictors = table[feature_cols]
    target = table[label_col]

    target_kind = sklearn_utils.multiclass.type_of_target(target)
    if target_kind == 'continuous':
        raise_error('0718', 'label_col')

    # Hyper-parameters shared by the estimator and the recorded params.
    forest_options = dict(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=None if max_features == "None" else max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        random_state=random_state,
    )

    classifier = RandomForestClassifier(**forest_options)
    classifier.fit(predictors, target)

    model = {
        'classifier': classifier,
        'params': dict(feature_cols=feature_cols, label_col=label_col, **forest_options),
    }

    importance_figure = _plot_feature_importances(feature_cols, classifier)

    report = BrtcReprBuilder()
    template = strip_margin("""
    | ## Random Forest Classification Train Result
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    """.format(fig_feature_importances=importance_figure))
    report.addMD(template)
    model['_repr_brtc_'] = report.get()

    return {'model': model}