def _knn_classification(train_table,
                        test_table,
                        feature_cols,
                        label_col,
                        k=5,
                        algorithm='auto',
                        leaf_size=30,
                        p=2,
                        pred_col_name='prediction',
                        prob_col_prefix='probability',
                        suffix='index'):

    _, X_train = check_col_type(train_table, feature_cols)
    y_train = train_table[label_col]
    _, X_test = check_col_type(test_table, feature_cols)

    if (sklearn_utils.multiclass.type_of_target(y_train) == 'continuous'):
        raise_error('0718', 'label_col')

    knn = KNeighborsClassifier(n_neighbors=k,
                               algorithm=algorithm,
                               leaf_size=leaf_size,
                               p=p)

    # Predict the class labels for the provided data
    knn.fit(X_train, y_train)
    classes = knn.classes_
    if (test_table.shape[0] == 0):
        new_cols = test_table.columns.tolist() + [pred_col_name]
        if suffix == 'index':
            prob_cols = [
                prob_col_prefix + '_{}'.format(i) for i in range(len(classes))
            ]
        else:
            prob_cols = [prob_col_prefix + '_{}'.format(i) for i in classes]
        new_cols += prob_cols
        out_table = pd.DataFrame(columns=new_cols)
        return {'out_table': out_table}
    pred = knn.predict(X_test)
    out_col_pred = pd.DataFrame(pred, columns=[pred_col_name])

    if suffix == 'index':
        suffixes = [i for i, _ in enumerate(classes)]
    else:
        suffixes = classes

    # Return probability estimates for the test data
    prob = knn.predict_proba(X_test)
    prob_col_name = [
        '{prob_col_prefix}_{suffix}'.format(prob_col_prefix=prob_col_prefix,
                                            suffix=suffix)
        for suffix in suffixes
    ]
    out_col_prob = pd.DataFrame(data=prob, columns=prob_col_name)

    # Result
    out_table = pd.concat(
        [test_table.reset_index(drop=True), out_col_pred, out_col_prob],
        axis=1)
    return {'out_table': out_table}
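# Usage sketch (illustration only, not library code): the same KNN
# predict/predict_proba flow on synthetic data. The toy frames and column
# names here are assumptions.
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

def _knn_usage_sketch():
    rng = np.random.RandomState(0)
    train = pd.DataFrame({'x1': rng.randn(20), 'x2': rng.randn(20),
                          'label': rng.choice(['a', 'b'], 20)})
    test = pd.DataFrame({'x1': rng.randn(5), 'x2': rng.randn(5)})
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(train[['x1', 'x2']], train['label'])
    out = test.copy()
    out['prediction'] = knn.predict(test[['x1', 'x2']])
    prob = knn.predict_proba(test[['x1', 'x2']])
    # One probability column per class, suffixed by class index
    for i, _ in enumerate(knn.classes_):
        out['probability_{}'.format(i)] = prob[:, i]
    return out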
def _logistic_regression_predict(table,
                                 model,
                                 prediction_col='prediction',
                                 prob_prefix='probability',
                                 output_log_prob=False,
                                 log_prob_prefix='log_probability',
                                 thresholds=None,
                                 suffix='index'):
    feature_cols = model['features']
    feature_names, features = check_col_type(table, feature_cols)
    lr_model = model['lr_model']
    classes = lr_model.classes_
    len_classes = len(classes)
    is_binary = len_classes == 2

    if thresholds is None:
        thresholds = np.array([1 / len_classes for _ in classes])
    elif isinstance(thresholds, list):
        if len(thresholds) == 1 and is_binary and 0 < thresholds[0] < 1:
            thresholds = np.array([thresholds[0], 1 - thresholds[0]])
        else:
            thresholds = np.array(thresholds)

    len_thresholds = len(thresholds)
    if len_classes > 0 and len_thresholds > 0 and len_classes != len_thresholds:
        # FN-0613='%s' must have length equal to the number of classes.
        raise_error('0613', ['thresholds'])

    prob = lr_model.predict_proba(features)
    prediction = classes[np.argmax(prob / thresholds, axis=1)]

    out_table = table.copy()
    out_table[prediction_col] = prediction

    if suffix == 'index':
        suffixes = [i for i, _ in enumerate(classes)]
    else:
        suffixes = classes

    prob_cols = [
        '{probability_col}_{suffix}'.format(probability_col=prob_prefix,
                                            suffix=suffix)
        for suffix in suffixes
    ]
    prob_df = pd.DataFrame(data=prob, columns=prob_cols)

    if output_log_prob:
        log_prob = lr_model.predict_log_proba(features)
        logprob_cols = [
            '{log_probability_col}_{suffix}'.format(
                log_probability_col=log_prob_prefix, suffix=suffix)
            for suffix in suffixes
        ]
        logprob_df = pd.DataFrame(data=log_prob, columns=logprob_cols)
        out_table = pd.concat([out_table, prob_df, logprob_df], axis=1)
    else:
        out_table = pd.concat([out_table, prob_df], axis=1)

    return {'out_table': out_table}
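# Sketch (assumed example data): how per-class thresholds shift the decision.
# Dividing each probability row by 'thresholds' before argmax lets a class win
# as soon as its probability exceeds its own threshold share.
import numpy as np

prob = np.array([[0.70, 0.30],
                 [0.55, 0.45]])
classes = np.array(['neg', 'pos'])
default = np.array([0.5, 0.5])  # equivalent to a plain argmax
biased = np.array([0.8, 0.2])   # favor 'pos': it needs far less probability
print(classes[np.argmax(prob / default, axis=1)])  # ['neg' 'neg']
print(classes[np.argmax(prob / biased, axis=1)])   # ['pos' 'pos']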
def _logistic_regression_train(table, feature_cols, label_col, penalty='l2', dual=False, tol=0.0001, C=1.0,
                               fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None,
                               solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False,
                               n_jobs=1):

    features = table[feature_cols]
    label = table[label_col]

    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    lr_model = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state, solver=solver,
                                  max_iter=max_iter, multi_class=multi_class,
                                  verbose=verbose, warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2

    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_cols})
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)), axis=0)
        if not is_binary:
            summary = pd.concat((summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
        else:
            summary = pd.concat((summary, pd.DataFrame(coef_trans, columns=[classes[0]])), axis=1)
            
    else:
        summary = pd.DataFrame({'features': feature_cols})
        coef_trans = np.transpose(coefficients)
        
        if not is_binary:
            summary = pd.concat((summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
        else:
            summary = pd.concat((summary, pd.DataFrame(coef_trans, columns=[classes[0]])), axis=1)
        
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    """.format(table1=pandasDF2MD(summary)
               )))

    model = _model_dict('logistic_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()

    return {'model' : model}
def _under_sampling(table, label_col, sampling_strategy='not majority', seed=None, estimator='KMeans',
                    n_clusters=8, voting='auto', n_jobs=1):

    # Separate features and label
    features = table.drop([label_col], axis=1)
    y = table[label_col]

    if sklearn_utils.multiclass.type_of_target(y) == 'continuous':
        raise_error('0718', 'label_col')

    # One label encoder per categorical column so each can be inverted later
    encoders = dict()

    # Filter out categorical columns in features
    categorical_cols = [col for col in features.columns if features[col].dtypes == 'object']

    # Encode categorical columns in place
    for cate_col in categorical_cols:
        encoders[cate_col] = preprocessing.LabelEncoder()
        features[cate_col] = encoders[cate_col].fit_transform(features[cate_col])

    # Encode the label column if it has object type
    if y.dtypes == 'object':
        lab_encoder = preprocessing.LabelEncoder()
        y_encoder = lab_encoder.fit_transform(y)
    else:
        y_encoder = y

    if estimator == 'KMeans':
        estimator_model = KMeans(n_clusters=n_clusters)
    else:
        estimator_model = None
    
    # Perform under-sampling
    sm = ClusterCentroids(sampling_strategy=sampling_strategy, random_state=seed,
                          estimator=estimator_model, voting=voting, n_jobs=n_jobs)
    
    X_res, y_res = sm.fit_resample(features, y_encoder)

    # Invert to the original data
    if y.dtypes == 'object':
        y_decoder = lab_encoder.inverse_transform(y_res)
    else:
        y_decoder = y_res

    df = pd.DataFrame(data=X_res, columns=features.columns)

    # Decode each categorical column with the encoder that was fit on it
    for cate_col in categorical_cols:
        df[cate_col] = encoders[cate_col].inverse_transform(df[cate_col].astype('int32'))

    df1 = pd.DataFrame(data=y_decoder, columns=[label_col])

    # Output result
    out_table = df.join(df1)

    return {'out_table' : out_table}
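# Usage sketch (assumption): the core resampling step with imbalanced-learn's
# ClusterCentroids on a toy imbalanced frame, using only parameters that are
# stable across imbalanced-learn versions.
import pandas as pd
from collections import Counter
from imblearn.under_sampling import ClusterCentroids

df = pd.DataFrame({'x': range(12), 'label': ['a'] * 9 + ['b'] * 3})
sampler = ClusterCentroids(sampling_strategy='not minority', random_state=42)
X_res, y_res = sampler.fit_resample(df[['x']], df['label'])
print(Counter(y_res))  # majority reduced to the minority count: 3 of each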
def check_col_type(table, feature_cols):
    test_table = table[feature_cols]
    if (check_list(test_table)):
        test_table = table[feature_cols[0]].tolist()
        feature_names = [
            feature_cols[0] + '_{}'.format(i)
            for i in range(len(test_table[0]))
        ]
        return feature_names, test_table
    elif (check_all_numbers(test_table)):
        return feature_cols, test_table
    else:
        raise_error('0720', 'feature_cols')
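# Sketch (assumed example data): the two input layouts check_col_type accepts.
# Either a single column whose cells are fixed-length vectors, or several
# all-numeric columns; anything else raises error 0720.
import pandas as pd

vector_layout = pd.DataFrame({'features': [[1.0, 2.0], [3.0, 4.0]]})
numeric_layout = pd.DataFrame({'x1': [1.0, 3.0], 'x2': [2.0, 4.0]})
# For the vector layout, generated names follow the '<col>_<i>' pattern
names = ['features_{}'.format(i)
         for i in range(len(vector_layout['features'][0]))]
print(names)  # ['features_0', 'features_1']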
def _SMOTE(table,
           label_col,
           sampling_strategy='not majority',
           seed=None,
           k_neighbors=5,
           m_neighbors=10,
           out_step=0.5,
           kind='regular',
           svm_estimator='svc',
           n_jobs=1):

    features = table.drop([label_col], axis=1)
    y = table[label_col]

    if (sklearn_utils.multiclass.type_of_target(y) == 'continuous'):
        raise_error('0718', 'label_col')

    lab_encoder = preprocessing.LabelEncoder()
    y_encoder = lab_encoder.fit_transform(y)

    if (kind == 'svm'):
        svc_model = svm.SVC()
    else:
        svc_model = None

    sm = SMOTE_LIB(sampling_strategy=sampling_strategy,
                   random_state=seed,
                   k_neighbors=k_neighbors,
                   m_neighbors=m_neighbors,
                   out_step=out_step,
                   kind=kind,
                   svm_estimator=svc_model,
                   n_jobs=n_jobs)

    X_res, y_res = sm.fit_resample(features, y_encoder)
    y_decoder = lab_encoder.inverse_transform(y_res)

    df = pd.DataFrame(data=X_res, columns=features.columns)
    df1 = pd.DataFrame(data=y_decoder, columns=[label_col])

    out_table = df.join(df1)

    return {'out_table': out_table}
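# Usage sketch (assumption): the essential SMOTE resampling step. The
# 'kind'/'m_neighbors'/'svm_estimator' options above come from an older
# imbalanced-learn API (current releases split them into BorderlineSMOTE and
# SVMSMOTE), so only the common parameters appear here.
import pandas as pd
from collections import Counter
from imblearn.over_sampling import SMOTE

df = pd.DataFrame({'x1': range(20), 'x2': range(20, 40),
                   'label': ['a'] * 14 + ['b'] * 6})
sm = SMOTE(sampling_strategy='not majority', random_state=0, k_neighbors=3)
X_res, y_res = sm.fit_resample(df[['x1', 'x2']], df['label'])
print(Counter(y_res))  # minority oversampled to match: 14 of each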
def _decision_tree_classification_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]),
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        class_weight=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):

    y_train = table[label_col]

    if (sklearn_utils.multiclass.type_of_target(y_train) == 'continuous'):
        raise_error('0718', 'label_col')

    classifier = DecisionTreeClassifier(
        criterion=criterion, splitter=splitter, max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features, random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split, class_weight=class_weight,
        presort=presort)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight=sample_weight, check_input=check_input,
                   X_idx_sorted=X_idx_sorted)

    try:
        from io import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(classifier,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        class_names=table[label_col].astype('str').unique(),
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = ("Graphviz is needed to draw a Decision Tree graph. "
                    "Please download it from http://graphviz.org/download/ "
                    "and install it on your computer.")

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             feature_importance[indices],
             color='b',
             align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    # Add tree plot
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
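# Sketch (assumption): when Graphviz/pydotplus is unavailable, scikit-learn's
# own plot_tree renders a comparable tree figure with matplotlib alone.
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree

iris = load_iris()
clf = DecisionTreeClassifier(max_depth=2, random_state=0)
clf.fit(iris.data, iris.target)
plot_tree(clf, feature_names=iris.feature_names,
          class_names=[str(c) for c in iris.target_names],
          filled=True, rounded=True)
plt.show()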
def raise_error(error_code, error_message_params, true_condition=False):
    common_validation.raise_error(error_code, error_message_params, true_condition)
def _logistic_regression_train(table,
                               feature_cols,
                               label_col,
                               penalty='l2',
                               dual=False,
                               tol=0.0001,
                               C=1.0,
                               fit_intercept=True,
                               intercept_scaling=1,
                               class_weight=None,
                               random_state=None,
                               solver='liblinear',
                               max_iter=100,
                               multi_class='ovr',
                               verbose=0,
                               warm_start=False,
                               n_jobs=1):

    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]

    if (sklearn_utils.multiclass.type_of_target(label) == 'continuous'):
        raise_error('0718', 'label_col')

    lr_model = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state, solver=solver,
                                  max_iter=max_iter, multi_class=multi_class,
                                  verbose=verbose, warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)
    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2
    prob = lr_model.predict_proba(features)
    classes_dict = {cls: i for i, cls in enumerate(classes)}
    tmp_label = np.array([classes_dict[v] for v in label])
    # Sum log-probabilities of the observed labels; multiplying raw
    # probabilities underflows to zero on large tables
    log_likelihood = np.sum(np.log(prob[np.arange(len(table)), tmp_label]))
    if fit_intercept:
        k = len(feature_cols) + 1
    else:
        k = len(feature_cols)
    aic = 2 * k - 2 * log_likelihood
    bic = np.log(len(table)) * k - 2 * log_likelihood
    if is_binary:
        if fit_intercept:
            x_design = np.hstack([np.ones((features.shape[0], 1)), features])
        else:
            x_design = features.values
        v = np.prod(prob, axis=1)
        x_design_modi = np.array(
            [x_design[i] * v[i] for i in range(len(x_design))])
        cov_logit = np.linalg.inv(np.dot(x_design_modi.T, x_design))
        std_err = np.sqrt(np.diag(cov_logit))
        if fit_intercept:
            logit_params = np.insert(coefficients, 0, intercept)
        else:
            logit_params = coefficients
        wald = (logit_params / std_err)**2
        p_values = 1 - chi2.cdf(wald, 1)
    else:
        if fit_intercept:
            x_design = np.hstack([np.ones((features.shape[0], 1)), features])
        else:
            x_design = features.values
        std_err = []
        for i in range(len(classes)):
            v = prob.T[i] * (1 - prob.T[i])
            x_design_modi = np.array(
                [x_design[j] * v[j] for j in range(len(x_design))])
            cov_logit = np.linalg.inv(np.dot(x_design_modi.T, x_design))
            std_err.append(np.sqrt(np.diag(cov_logit)))
        std_err = np.array(std_err)

    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_names})
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)),
                                    axis=0)

    else:
        summary = pd.DataFrame({'features': feature_names})
        coef_trans = np.transpose(coefficients)

    if not is_binary:
        summary = pd.concat(
            (summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
    else:
        summary = pd.concat(
            (summary, pd.DataFrame(coef_trans, columns=[classes[0]])), axis=1)
    if is_binary:
        summary = pd.concat(
            (summary, pd.DataFrame(std_err, columns=['standard_error']),
             pd.DataFrame(wald, columns=['wald_statistic']),
             pd.DataFrame(p_values, columns=['p_value'])),
            axis=1)
    else:
        columns = [
            'standard_error_{}'.format(classes[i]) for i in range(len(classes))
        ]
        summary = pd.concat(
            (summary, pd.DataFrame(std_err.T, columns=columns)), axis=1)
        arrange_col = ['features']
        for i in range(len(classes)):
            arrange_col.append(classes[i])
            arrange_col.append('standard_error_{}'.format(classes[i]))
        summary = summary[arrange_col]
    if is_binary:
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
        | ## Logistic Regression Result
        | ### Summary
        | {table1}
        |
        | ##### Column '{small}' is the coefficients under the assumption ({small} = 0, {big} = 1).
        |
        | #### AIC : {aic}
        |
        | #### BIC : {bic}
        """.format(small=classes[0],
                   big=classes[1],
                   table1=pandasDF2MD(summary, num_rows=100),
                   aic=aic,
                   bic=bic)))
    else:
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
        | ## Logistic Regression Result
        | ### Summary
        | {table1}
        |
        | ##### Each column whose name is one of classes of Label Column is the coefficients under the assumption it is 1 and others are 0.
        |
        | ##### For example, column '{small}' is the coefficients under the assumption ({small} = 1, others = 0).
        |
        | #### AIC : {aic}
        |
        | #### BIC : {bic}
        """.format(small=classes[0],
                   table1=pandasDF2MD(summary, num_rows=100),
                   aic=aic,
                   bic=bic)))

    model = _model_dict('logistic_regression_model')
    model['standard_errors'] = std_err
    model['aic'] = aic
    model['bic'] = bic
    if is_binary:
        model['wald_statistics'] = wald
        model['p_values'] = p_values
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()
    model['summary'] = summary
    return {'model': model}
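# Sketch (assumption): the AIC/BIC computation above in isolation. Summing
# log-probabilities of the observed labels avoids the underflow a raw product
# of probabilities hits on large tables. 'k' mirrors the trainer's convention
# of counting one coefficient per feature plus the intercept.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
lr = LogisticRegression(max_iter=1000).fit(X, y)
prob = lr.predict_proba(X)
class_index = np.searchsorted(lr.classes_, y)
log_likelihood = np.log(prob[np.arange(len(y)), class_index]).sum()
k = X.shape[1] + 1
aic = 2 * k - 2 * log_likelihood
bic = np.log(len(y)) * k - 2 * log_likelihood
print(aic, bic)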
def _ftest_for_stacked_data(table,
                            response_cols,
                            factor_col,
                            alternatives,
                            first=None,
                            second=None,
                            confi_level=0.95):

    # Coerce user-supplied 'first'/'second' to the dtype of the factor column
    # (bool or numeric) so the equality filters below can match
    if first is not None or second is not None:
        check_table = np.array(table[factor_col])
        for element in check_table:
            if element is not None:
                if type(element) != str:
                    if type(element) == bool:
                        if first is not None and second is not None:
                            first = bool(first)
                            second = bool(second)
                            break
                        if first is not None:
                            first = bool(first)
                            break
                        second = bool(second)
                        break
                    else:
                        if first is not None and second is not None:
                            first = float(first)
                            second = float(second)
                            break
                        if first is not None:
                            first = float(first)
                            break
                        second = float(second)
                        break
                else:
                    break
    if first is None or second is None:
        tmp_factors = np.unique(table[factor_col])
        if len(tmp_factors) != 2:
            raise_error('0719', 'factor_col')
    if first is None:
        if tmp_factors[0] != second:
            first = tmp_factors[0]
        else:
            first = tmp_factors[1]
    if second is None:
        if tmp_factors[0] != first:
            second = tmp_factors[0]
        else:
            second = tmp_factors[1]
    table_first = table[table[factor_col] == first]
    table_second = table[table[factor_col] == second]
    tmp_table = []
    number1 = len(table_first[factor_col])
    number2 = len(table_second[factor_col])
    d_num = number1 - 1
    d_denum = number2 - 1
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    ## F Test for Stacked Data Result
    | - Confidence level = {confi_level}
    | - Statistics = F statistic, F distribution with {d_num} numerator degrees of freedom and {d_denum} degrees of freedom under the null hypothesis
    """.format(confi_level=confi_level, d_num=d_num, d_denum=d_denum)))

    for response_col in response_cols:
        tmp_model = []
        std1 = (table_first[response_col]).std()
        std2 = (table_second[response_col]).std()
        f_value = (std1**2) / (std2**2)

        if 'larger' in alternatives:
            # P(F >= f_value) with (d_num, d_denum) degrees of freedom
            p_value = scipy.stats.f.sf(f_value, d_num, d_denum)
            tmp_model += [
                ['true ratio > 1'] + [p_value] +
                [(f_value /
                  (scipy.stats.f.ppf(confi_level, d_num, d_denum)), math.inf)]
            ]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true ratio of variances > 1'] + [
                'F statistic, F distribution with %d numerator degrees of freedom and %d degrees of freedom under the null hypothesis.'
                % (d_num, d_denum)
            ] + [f_value] + [p_value] + [confi_level] + [
                f_value / (scipy.stats.f.ppf(confi_level, d_num, d_denum))
            ] + [math.inf]]

        if 'smaller' in alternatives:
            p_value = scipy.stats.f.cdf(f_value, d_num, d_denum)
            tmp_model += [['true ratio < 1'] + [p_value] +
                          [(0.0, f_value *
                            (scipy.stats.f.ppf(confi_level, d_denum, d_num)))]]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true ratio of variances < 1'] + [
                'F statistic, F distribution with %d numerator degrees of freedom and %d degrees of freedom under the null hypothesis.'
                % (d_num, d_denum)
            ] + [f_value] + [p_value] + [confi_level] + [0.0] + [
                f_value * (scipy.stats.f.ppf(confi_level, d_denum, d_num))
            ]]

        if 'two-sided' in alternatives:
            cdf_f = scipy.stats.f.cdf(f_value, d_num, d_denum)
            # Two-sided p-value: twice the smaller tail probability
            p_value = 2 * min(cdf_f, 1 - cdf_f)
            tmp_model += [
                ['true ratio != 1'] + [p_value] +
                [(f_value / (scipy.stats.f.ppf(
                    (1 + confi_level) / 2, d_num, d_denum)), f_value *
                  (scipy.stats.f.ppf((1 + confi_level) / 2, d_denum, d_num)))]
            ]
            tmp_table += [[
                '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
            ] + ['true ratio of variances != 1'] + [
                'F statistic, F distribution with %d numerator degrees of freedom and %d degrees of freedom under the null hypothesis.'
                % (d_num, d_denum)
            ] + [f_value] + [p_value] + [confi_level] + [
                f_value / (scipy.stats.f.ppf(
                    (1 + confi_level) / 2, d_num, d_denum))
            ] + [
                f_value * (scipy.stats.f.ppf(
                    (1 + confi_level) / 2, d_denum, d_num))
            ]]

        result_model = pd.DataFrame.from_records(tmp_model)
        result_model.columns = [
            'alternative_hypothesis', 'p-value',
            '%g%% confidence interval' % (confi_level * 100)
        ]
        rb.addMD(
            strip_margin("""
        | #### Data = {response_col} by {factor_col}({first},{second})
        | - F-value = {f_value}
        |
        | {result_model}
        |
        """.format(response_col=response_col,
                   factor_col=factor_col,
                   first=first,
                   second=second,
                   f_value=f_value,
                   result_model=pandasDF2MD(result_model))))

    result = pd.DataFrame.from_records(tmp_table)
    result.columns = [
        'data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value',
        'confidence_level', 'lower_confidence_interval',
        'upper_confidence_interval'
    ]

    model = dict()
    model['_repr_brtc_'] = rb.get()
    return {'out_table': result, 'model': model}
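# Sketch (assumed example data): the two-sided variance-ratio F test in
# isolation, matching the p-value and confidence-interval formulas used above.
import numpy as np
import scipy.stats

rng = np.random.RandomState(1)
a = rng.normal(0, 1.0, 30)
b = rng.normal(0, 1.5, 25)
f_value = a.var(ddof=1) / b.var(ddof=1)
d_num, d_denum = len(a) - 1, len(b) - 1
cdf_f = scipy.stats.f.cdf(f_value, d_num, d_denum)
p_value = 2 * min(cdf_f, 1 - cdf_f)
lower = f_value / scipy.stats.f.ppf(0.975, d_num, d_denum)
upper = f_value * scipy.stats.f.ppf(0.975, d_denum, d_num)
print(f_value, p_value, (lower, upper))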
def _mlp_classification_train(table,
                              feature_cols,
                              label_col,
                              hidden_layer_sizes=(100, ),
                              activation='relu',
                              solver='adam',
                              alpha=0.0001,
                              batch_size_auto=True,
                              batch_size='auto',
                              learning_rate='constant',
                              learning_rate_init=0.001,
                              max_iter=200,
                              random_state=None,
                              tol=0.0001):

    _, features = check_col_type(table, feature_cols)
    label = table[label_col]

    if (sklearn_utils.multiclass.type_of_target(label) == 'continuous'):
        raise_error('0718', 'label_col')

    mlp_model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,
                              activation=activation,
                              solver=solver,
                              alpha=alpha,
                              batch_size=batch_size,
                              learning_rate=learning_rate,
                              learning_rate_init=learning_rate_init,
                              max_iter=max_iter,
                              shuffle=True,
                              random_state=random_state,
                              tol=tol)
    mlp_model.fit(features, label)

    predict = mlp_model.predict(features)

    _accuracy_score = accuracy_score(label, predict)
    _f1_score = f1_score(label, predict, average='micro')
    _precision_score = precision_score(label, predict, average='micro')
    _recall_score = recall_score(label, predict, average='micro')

    result_table = pd.DataFrame({
        'Metric': ['Accuracy Score', 'F1 Score',
                   'Precision Score', 'Recall Score'],
        'Score': [_accuracy_score, _f1_score,
                  _precision_score, _recall_score]
    })

    label_name = {
        'hidden_layer_sizes': 'Hidden Layer Sizes',
        'activation': 'Activation Function',
        'solver': 'Solver',
        'alpha': 'Alpha',
        'batch_size': 'Batch Size',
        'learning_rate': 'Learning Rate',
        'learning_rate_init': 'Learning Rate Initial',
        'max_iter': 'Max Iteration',
        'random_state': 'Seed',
        'tol': 'Tolerance'
    }
    get_param = mlp_model.get_params()
    param_table = pd.DataFrame({
        'Parameter': list(label_name.values()),
        'Value': [get_param[x] for x in label_name.keys()]
    })

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ### MLP Classification Result
    | {result}
    | ### Parameters
    | {list_parameters}
    """.format(result=pandasDF2MD(result_table),
               list_parameters=pandasDF2MD(param_table))))

    model = _model_dict('mlp_classification_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercepts'] = mlp_model.intercepts_
    model['coefficients'] = mlp_model.coefs_
    model['class'] = mlp_model.classes_
    model['loss'] = mlp_model.loss_
    model['accuracy_score'] = _accuracy_score
    model['f1_score'] = _f1_score
    model['precision_score'] = _precision_score
    model['recall_score'] = _recall_score
    model['activation'] = activation
    model['solver'] = solver
    model['alpha'] = alpha
    model['batch_size'] = batch_size
    model['learning_rate'] = learning_rate
    model['learning_rate_init'] = learning_rate_init
    model['max_iter'] = max_iter
    model['random_state'] = random_state
    model['tol'] = tol
    model['mlp_model'] = mlp_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
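# Usage sketch (assumption): the fit-and-score core of the trainer above on a
# toy dataset, reporting the same four micro-averaged metrics.
from sklearn.datasets import load_iris
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.neural_network import MLPClassifier

X, y = load_iris(return_X_y=True)
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=0)
mlp.fit(X, y)
pred = mlp.predict(X)
print(accuracy_score(y, pred),
      f1_score(y, pred, average='micro'),
      precision_score(y, pred, average='micro'),
      recall_score(y, pred, average='micro'))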
def _logistic_regression_predict(table,
                                 model,
                                 prediction_col='prediction',
                                 prob_prefix='probability',
                                 output_log_prob=False,
                                 log_prob_prefix='log_probability',
                                 thresholds=None,
                                 suffix='index'):
    if (table.shape[0] == 0):
        new_cols = table.columns.tolist() + [prediction_col]
        classes = model['lr_model'].classes_
        if suffix == 'index':
            prob_cols = [
                prob_prefix + '_{}'.format(i) for i in range(len(classes))
            ]
        else:
            prob_cols = [prob_prefix + '_{}'.format(i) for i in classes]
        if output_log_prob:
            if suffix == 'index':
                log_cols = [
                    log_prob_prefix + '_{}'.format(i)
                    for i in range(len(classes))
                ]
            else:
                log_cols = [log_prob_prefix + '_{}'.format(i) for i in classes]
        else:
            log_cols = []
        new_cols += prob_cols + log_cols
        out_table = pd.DataFrame(columns=new_cols)
        return {'out_table': out_table}
    if 'features' in model:
        feature_cols = model['features']
    else:
        feature_cols = model['feature_cols']
    if 'lr_model' in model:
        feature_names, features = check_col_type(table, feature_cols)
        features = pd.DataFrame(features, columns=feature_names)
    else:
        features = table[feature_cols]
    if 'auto' in model and 'vs' not in model['_type']:
        if model['auto']:
            one_hot_input = model['table_4'][:-1][model['table_4']['data_type']
                                                  [:-1] == 'string'].index
            if len(one_hot_input) != 0:
                features = one_hot_encoder(
                    prefix='col_name',
                    table=features,
                    input_cols=features.columns[one_hot_input].tolist(),
                    suffix='label')['out_table']
                features = features[model['table_2']['features']]
        else:
            one_hot_input = model['table_3'][:-1][model['table_3']['data_type']
                                                  [:-1] == 'string'].index
            if len(one_hot_input) != 0:
                features = one_hot_encoder(
                    prefix='col_name',
                    table=features,
                    input_cols=features.columns[one_hot_input].tolist(),
                    suffix='label')['out_table']
                features = features[model['table_1']['features']]
    elif 'auto' in model and 'vs' in model['_type']:
        if model['auto']:
            one_hot_input = model['table_3'][:-1][model['table_3']['data_type']
                                                  [:-1] == 'string'].index
            if len(one_hot_input) != 0:
                features = one_hot_encoder(
                    prefix='col_name',
                    table=features,
                    input_cols=features.columns[one_hot_input].tolist(),
                    suffix='label')['out_table']
                features = features[model['table_2']['features']]
        else:
            one_hot_input = model['table_2'][:-1][model['table_2']['data_type']
                                                  [:-1] == 'string'].index
            if len(one_hot_input) != 0:
                features = one_hot_encoder(
                    prefix='col_name',
                    table=features,
                    input_cols=features.columns[one_hot_input].tolist(),
                    suffix='label')['out_table']
                features = features[model['table_1']['features']]
    if 'lr_model' in model:
        lr_model = model['lr_model']
        classes = lr_model.classes_
        len_classes = len(classes)
        is_binary = len_classes == 2
    else:
        fit_intercept = model['fit_intercept']
        if 'vs' not in model['_type']:
            len_classes = 2
            is_binary = True
            if 'auto' in model:
                if model['auto']:
                    classes = model['table_4']['labels'].values[-1]
                    classes_type = model['table_4']['data_type'].values[-1]
                    if classes_type == 'integer' or classes_type == 'long':
                        classes = np.array([int(i) for i in classes])
                    elif classes_type == 'float' or classes_type == 'double':
                        classes = np.array([float(i) for i in classes])
                    coefficients = model['table_3']['coefficients'][0]
                    intercept = model['table_3']['intercept'][0]
                else:
                    classes = model['table_3']['labels'].values[-1]
                    classes_type = model['table_3']['data_type'].values[-1]
                    if classes_type == 'integer' or classes_type == 'long':
                        classes = np.array([int(i) for i in classes])
                    elif classes_type == 'float' or classes_type == 'double':
                        classes = np.array([float(i) for i in classes])
                    coefficients = model['table_2']['coefficients'][0]
                    intercept = model['table_2']['intercept'][0]
            else:
                classes = np.array([0, 1])
                coefficients = model['table_2']['coefficient'][1:]
                if fit_intercept:
                    intercept = model['table_2']['coefficient'][0]
        else:
            if 'auto' in model:
                if model['auto']:
                    classes = np.array(model['table_3']['labels'].values[-1])
                    len_classes = len(classes)
                    is_binary = len_classes == 2
                    intercept = model['table_2'].intercept
                    coefficients = model['table_2'].coefficients
                else:
                    classes = np.array(model['table_2']['labels'].values[-1])
                    len_classes = len(classes)
                    is_binary = len_classes == 2
                    intercept = model['table_1'].intercept
                    coefficients = model['table_1'].coefficients
            else:
                classes = np.array(model['table_1'].labelInfo)
                len_classes = len(classes)
                is_binary = len_classes == 2
                intercept = model['table_1'].intercept
                coefficients = (model['table_1'][[
                    i for i in model['table_1'].columns if 'coefficient' in i
                ]]).values
    if thresholds is None:
        thresholds = np.array([1 / len_classes for _ in classes])
    elif isinstance(thresholds, list):
        if len(thresholds) == 1 and is_binary and 0 < thresholds[0] < 1:
            thresholds = np.array([thresholds[0], 1 - thresholds[0]])
        else:
            thresholds = np.array(thresholds)
    len_thresholds = len(thresholds)
    if len_classes > 0 and len_thresholds > 0 and len_classes != len_thresholds:
        # FN-0613='%s' must have length equal to the number of classes.
        raise_error('0613', ['thresholds'])

    if 'lr_model' in model:
        prob = lr_model.predict_proba(features)
    else:
        features = features.values
        coefficients = np.array(coefficients)
        if is_binary:
            tmp = features * coefficients
            if fit_intercept or 'auto' in model:
                prob = 1 / (np.exp(np.sum(tmp, axis=1) + intercept) + 1)
            else:
                prob = 1 / (np.exp(np.sum(tmp, axis=1)) + 1)
            prob = np.array([[x, 1 - x] for x in prob])
        else:
            prob = []
            for i in range(len(coefficients)):
                tmp = features * coefficients[i]
                if fit_intercept:
                    prob.append(
                        1 / (np.exp(-np.sum(tmp, axis=1) - intercept[i]) + 1))
                else:
                    prob.append(1 / (np.exp(-np.sum(tmp, axis=1)) + 1))
            prob = np.array(prob).T
            prob = np.apply_along_axis(lambda x: x / np.sum(x), 1, prob)
    prediction = classes[np.argmax(prob / thresholds, axis=1)]

    out_table = table.copy()
    out_table[prediction_col] = prediction

    if suffix == 'index':
        suffixes = [i for i, _ in enumerate(classes)]
    else:
        suffixes = classes

    prob_cols = [
        '{probability_col}_{suffix}'.format(probability_col=prob_prefix,
                                            suffix=suffix)
        for suffix in suffixes
    ]
    prob_df = pd.DataFrame(data=prob, columns=prob_cols)

    if output_log_prob:
        log_prob = np.log(prob)
        logprob_cols = [
            '{log_probability_col}_{suffix}'.format(
                log_probability_col=log_prob_prefix, suffix=suffix)
            for suffix in suffixes
        ]
        logprob_df = pd.DataFrame(data=log_prob, columns=logprob_cols)
        out_table = pd.concat([out_table, prob_df, logprob_df], axis=1)
    else:
        out_table = pd.concat([out_table, prob_df], axis=1)

    return {'out_table': out_table}
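# Sketch (assumption): the manual binary probability path above, checked
# against predict_proba. 1 / (1 + exp(w.x + b)) is the probability of the
# first class; the second class gets the complement.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=4, random_state=0)
lr = LogisticRegression().fit(X, y)
z = X @ lr.coef_[0] + lr.intercept_[0]
p_first = 1 / (1 + np.exp(z))  # P(classes_[0])
manual = np.column_stack([p_first, 1 - p_first])
print(np.allclose(manual, lr.predict_proba(X)))  # True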
def _random_forest_classification_train(table, feature_cols, label_col,
                                 n_estimators=10, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1,
                                 min_weight_fraction_leaf=0, max_features="sqrt",
                                 max_leaf_nodes=None, min_impurity_decrease=0, class_weight=None, random_state=None):
    
    feature_names, features_train = check_col_type(table, feature_cols)
    # X_train = table[feature_cols]
    y_train = table[label_col]

    if(type_of_target(y_train) == 'continuous'):
        raise_error('0718', 'label_col')
    
    if max_features == "n":
        max_features = None
        
    class_labels = y_train.unique()
    if class_weight is not None:
        if len(class_weight) != len(class_labels):
            raise ValueError("Number of class weights should match number of labels.")
        else:
            classes = sorted(class_labels)              
            class_weight = {classes[i] : class_weight[i] for i in range(len(classes))}
            
    classifier = RandomForestClassifier(n_estimators=n_estimators,
                                        criterion=criterion,
                                        max_depth=max_depth,
                                        min_samples_split=min_samples_split,
                                        min_samples_leaf=min_samples_leaf,
                                        min_weight_fraction_leaf=min_weight_fraction_leaf,
                                        max_features=max_features,
                                        max_leaf_nodes=max_leaf_nodes,
                                        min_impurity_decrease=min_impurity_decrease,
                                        class_weight=class_weight,
                                        random_state=random_state)
    classifier.fit(features_train, y_train)

    params = {'feature_cols': feature_cols,
             'label_col': label_col,
             'n_estimators': n_estimators,
             'criterion': criterion,
             'max_depth': max_depth,
             'min_samples_split': min_samples_split,
             'min_samples_leaf': min_samples_leaf,
             'min_weight_fraction_leaf': min_weight_fraction_leaf,
             'max_features': max_features,
             'max_leaf_nodes': max_leaf_nodes,
             'min_impurity_decrease': min_impurity_decrease,
             'class_weight': class_weight,
             'random_state': random_state}
    
    model = _model_dict('random_forest_classification_model')
    model['classifier'] = classifier
    model['params'] = params

    fig_feature_importances = _plot_feature_importances(feature_names, classifier)
           
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Random Forest Classification Train Result
    |
    | ### Parameters
    | {params}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    """.format(params=dict2MD(params), fig_feature_importances=fig_feature_importances)))
        
    model['_repr_brtc_'] = rb.get()
    feature_importance = classifier.feature_importances_
    feature_importance_table = pd.DataFrame([[feature_cols[i], feature_importance[i]] for i in range(len(feature_cols))], columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model' : model}
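# Sketch (assumption): how the positional class-weight list above is mapped
# onto scikit-learn's dict form, keyed by the sorted class labels.
from sklearn.ensemble import RandomForestClassifier

class_labels = ['no', 'yes']
class_weight_list = [1.0, 5.0]
class_weight = {c: w for c, w in zip(sorted(class_labels), class_weight_list)}
print(class_weight)  # {'no': 1.0, 'yes': 5.0}
clf = RandomForestClassifier(n_estimators=10, class_weight=class_weight,
                             random_state=0)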
def _two_sample_ttest_for_stacked_data(table, response_cols, factor_col, alternatives, first=None, second=None, hypo_diff=0, equal_vari='pooled', confi_level=0.95):
    # Coerce user-supplied 'first'/'second' to the dtype of the factor column
    # (bool or numeric) so the equality filters below can match
    if first is not None or second is not None:
        check_table = np.array(table[factor_col])
        for element in check_table:
            if element is not None:
                if type(element) != str:
                    if type(element) == bool:
                        if first is not None and second is not None:
                            first = bool(first)
                            second = bool(second)
                            break
                        if first is not None:
                            first = bool(first)
                            break
                        second = bool(second)
                        break
                    else:
                        if first is not None and second is not None:
                            first = float(first)
                            second = float(second)
                            break
                        if first is not None:
                            first = float(first)
                            break
                        second = float(second)
                        break
                else:
                    break
    if first is None or second is None:
        tmp_factors = np.unique(table[factor_col])
        if len(tmp_factors) != 2:
            raise_error('0719', 'factor_col')
    if first is None:
        if tmp_factors[0] != second:
            first = tmp_factors[0]
        else:
            first = tmp_factors[1]
    if second is None:
        if tmp_factors[0] != first:
            second = tmp_factors[0]
        else:
            second = tmp_factors[1]
    table_first = table[table[factor_col] == first]
    table_second = table[table[factor_col] == second]
    tmp_table = []

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    ## Two Sample T Test for Stacked Data Result
    | - Hypothesized mean = {hypo_diff}
    | - Confidence level = {confi_level}
    """.format(hypo_diff=hypo_diff, confi_level=confi_level)))

    for response_col in response_cols:
        tmp_model = []
        number1 = len(table_first[response_col])
        number2 = len(table_second[response_col])
        mean1 = (table_first[response_col]).mean()
        mean2 = (table_second[response_col]).mean()
        std1 = (table_first[response_col]).std()
        std2 = (table_second[response_col]).std()
        start_auto = 0
        if equal_vari == 'auto':
            start_auto = 1
            f_value = (std1 ** 2) / (std2 ** 2)
            # Two-sided p-value of the preliminary F test for equal variances
            cdf_f = stats.f.cdf(f_value, number1 - 1, number2 - 1)
            if cdf_f > 0.5:
                f_test_p_value = (1 - cdf_f) * 2
            else:
                f_test_p_value = cdf_f * 2
            if f_test_p_value < 0.05:
                equal_vari = 'unequal'
            else:
                equal_vari = 'pooled'
        # Default computation so 'ttestresult' is defined for the report below
        ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'larger', usevar=equal_vari, value=hypo_diff)

        if 'larger' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'larger', usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            if equal_vari == 'pooled':
                std_number1number2 = sqrt(((number1 - 1) * (std1) ** 2 + (number2 - 1) * (std2) ** 2) / (number1 + number2 - 2))
                margin = t.ppf(confi_level, df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if equal_vari == 'unequal':
                margin = t.ppf(confi_level, df) * sqrt(std1 ** 2 / number1 + std2 ** 2 / number2)
            tmp_model += [['true difference in means > {}'.format(hypo_diff)] + 
            [ttestresult[1]] + [(mean1 - mean2 - margin, math.inf)]]
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] + 
            ['true difference in means > {}'.format(hypo_diff)] + 
            ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] + 
            [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [mean1 - mean2 - margin] + [math.inf]]

        if 'smaller' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'smaller', usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            if equal_vari == 'pooled':
                std_number1number2 = sqrt(((number1 - 1) * (std1) ** 2 + (number2 - 1) * (std2) ** 2) / (number1 + number2 - 2))
                margin = t.ppf(confi_level, df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if equal_vari == 'unequal':
                margin = t.ppf(confi_level, df) * sqrt(std1 ** 2 / number1 + std2 ** 2 / number2)
            tmp_model += [['true difference in means < {}'.format(hypo_diff)] + 
            [ttestresult[1]] + [(-math.inf, mean1 - mean2 + margin)]] 
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] + 
            ['true difference in means < {}'.format(hypo_diff)] + 
            ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] + 
            [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [-math.inf] + [mean1 - mean2 + margin]] 

        if 'two-sided' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'two-sided', usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            if equal_vari == 'pooled':
                std_number1number2 = sqrt(((number1 - 1) * (std1) ** 2 + (number2 - 1) * (std2) ** 2) / (number1 + number2 - 2))
                margin = t.ppf((confi_level + 1) / 2, df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if equal_vari == 'unequal':
                margin = t.ppf((confi_level + 1) / 2, df) * sqrt(std1 ** 2 / number1 + std2 ** 2 / number2)
            tmp_model += [['true difference in means != {}'.format(hypo_diff)] + 
            [ttestresult[1]] + [(mean1 - mean2 - margin, mean1 - mean2 + margin)]]
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] + 
            ['true difference in means != {}'.format(hypo_diff)] + 
            ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] + 
            [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [mean1 - mean2 - margin] + [mean1 - mean2 + margin]]

        result_model = pd.DataFrame.from_records(tmp_model)
        result_model.columns = ['alternative hypothesis', 'p-value', '%g%% confidence interval' % (confi_level * 100)]
        rb.addMD(strip_margin("""
        | #### Data = {response_col} by {factor_col}({first},{second})
        |
        | - Statistics = t statistic, t distribution with {ttestresult2} degrees of freedom under the null hypothesis
        | - t-value = {ttestresult0}
        |
        | {result_model}
        |
        """.format(ttestresult2=ttestresult[2], response_col=response_col, factor_col=factor_col, first=first, second=second, ttestresult0=ttestresult[0], result_model=pandasDF2MD(result_model))))
        if start_auto == 1:
            equal_vari = 'auto'
    result = pd.DataFrame.from_records(tmp_table)
    result.columns = ['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'lower_confidence_interval', 'upper_confidence_interval']

    model = dict()
    model['_repr_brtc_'] = rb.get()
    return {'out_table': result, 'model': model}
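# Sketch (assumed example data): the pre-test-then-t-test flow above,
# condensed. A preliminary two-sided F test picks pooled vs. Welch variance
# handling, then statsmodels' ttest_ind returns (t statistic, p-value,
# degrees of freedom).
import numpy as np
from scipy import stats
from statsmodels.stats.weightstats import ttest_ind

rng = np.random.RandomState(0)
first = rng.normal(0.0, 1.0, 30)
second = rng.normal(0.5, 2.0, 40)
f_value = first.var(ddof=1) / second.var(ddof=1)
cdf_f = stats.f.cdf(f_value, len(first) - 1, len(second) - 1)
usevar = 'unequal' if 2 * min(cdf_f, 1 - cdf_f) < 0.05 else 'pooled'
tstat, pvalue, df = ttest_ind(first, second, alternative='two-sided',
                              usevar=usevar)
print(usevar, tstat, pvalue, df)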
def _random_forest_classification_train(table,
                                        feature_cols,
                                        label_col,
                                        n_estimators=10,
                                        criterion="gini",
                                        max_depth=None,
                                        min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0,
                                        max_features="sqrt",
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0,
                                        random_state=None):

    X_train = table[feature_cols]
    y_train = table[label_col]

    if (sklearn_utils.multiclass.type_of_target(y_train) == 'continuous'):
        raise_error('0718', 'label_col')

    if max_features == "None":
        max_features = None

    classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        random_state=random_state)
    classifier.fit(X_train, y_train)

    params = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'n_estimators': n_estimators,
        'criterion': criterion,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'min_weight_fraction_leaf': min_weight_fraction_leaf,
        'max_features': max_features,
        'max_leaf_nodes': max_leaf_nodes,
        'min_impurity_decrease': min_impurity_decrease,
        'random_state': random_state
    }

    model = dict()
    model['classifier'] = classifier
    model['params'] = params

    fig_feature_importances = _plot_feature_importances(
        feature_cols, classifier)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Random Forest Classification Train Result
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    """.format(fig_feature_importances=fig_feature_importances)))

    model['_repr_brtc_'] = rb.get()

    return {'model': model}
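# Sketch (assumption): '_plot_feature_importances' is referenced above but not
# shown; a minimal version, consistent with the feature-importance plot in the
# decision-tree trainer earlier, might look like this.
import matplotlib.pyplot as plt
import numpy as np

def _plot_feature_importances_sketch(feature_cols, classifier):
    importances = classifier.feature_importances_
    indices = np.argsort(importances)
    plt.barh(range(len(indices)), importances[indices], align='center')
    plt.yticks(range(len(indices)), np.array(feature_cols)[indices])
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    # The real helper presumably converts the figure to Markdown (e.g. plt2MD)
    plt.show()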