コード例 #1
0
ファイル: knn_regression.py プロジェクト: yemode2k/studio
def _knn_regression(train_table,
                    test_table,
                    feature_cols,
                    label_col,
                    k=5,
                    algorithm='auto',
                    leaf_size=30,
                    p=2,
                    pred_col_name='prediction'):
    """Fit a k-nearest-neighbours regressor and predict ``test_table``.

    Args:
        train_table: DataFrame holding the training rows.
        test_table: DataFrame holding the rows to predict.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_col: Name of the target column in ``train_table``.
        k: Number of neighbours (``n_neighbors``).
        algorithm: Neighbour-search algorithm passed to the estimator.
        leaf_size: Leaf size passed to the estimator.
        p: Minkowski power parameter passed to the estimator.
        pred_col_name: Name of the appended prediction column.

    Returns:
        ``{'out_table': DataFrame}`` - the test rows (index reset) with the
        prediction column appended.
    """
    # Nothing to predict: hand back an empty frame that already carries the
    # output header, without touching the training data at all.
    if test_table.shape[0] == 0:
        header = test_table.columns.tolist() + [pred_col_name]
        return {'out_table': pd.DataFrame(columns=header)}

    _, train_features = check_col_type(train_table, feature_cols)
    train_target = train_table[label_col]
    _, test_features = check_col_type(test_table, feature_cols)

    regressor = KNeighborsRegressor(n_neighbors=k,
                                    algorithm=algorithm,
                                    leaf_size=leaf_size,
                                    p=p)
    regressor.fit(train_features, train_target)

    # Predicted target values for the test rows, held in their own frame.
    prediction_frame = pd.DataFrame()
    prediction_frame[pred_col_name] = regressor.predict(test_features)

    # The test index is reset so it lines up with the positional index of
    # the prediction frame during the column-wise concat.
    combined = pd.concat(
        [test_table.reset_index(drop=True), prediction_frame], axis=1)
    return {'out_table': combined}
コード例 #2
0
ファイル: knn_classification.py プロジェクト: yemode2k/studio
def _knn_classification(train_table,
                        test_table,
                        feature_cols,
                        label_col,
                        k=5,
                        algorithm='auto',
                        leaf_size=30,
                        p=2,
                        pred_col_name='prediction',
                        prob_col_prefix='probability',
                        suffix='index'):
    """Fit a k-nearest-neighbours classifier and predict ``test_table``.

    Args:
        train_table: DataFrame holding the training rows.
        test_table: DataFrame holding the rows to classify.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_col: Name of the label column in ``train_table``.
        k: Number of neighbours (``n_neighbors``).
        algorithm: Neighbour-search algorithm passed to the estimator.
        leaf_size: Leaf size passed to the estimator.
        p: Minkowski power parameter passed to the estimator.
        pred_col_name: Name of the appended prediction column.
        prob_col_prefix: Prefix of the per-class probability columns.
        suffix: ``'index'`` names probability columns by class position;
            any other value names them by the class label itself.

    Returns:
        ``{'out_table': DataFrame}`` - the test rows (index reset) followed
        by the prediction column and one probability column per class.
    """
    _, train_features = check_col_type(train_table, feature_cols)
    train_labels = train_table[label_col]
    _, test_features = check_col_type(test_table, feature_cols)

    # A continuous target cannot be classified; error code '0718' is raised
    # through the project helper.
    target_kind = sklearn_utils.multiclass.type_of_target(train_labels)
    if target_kind == 'continuous':
        raise_error('0718', 'label_col')

    classifier = KNeighborsClassifier(n_neighbors=k,
                                      algorithm=algorithm,
                                      leaf_size=leaf_size,
                                      p=p)
    classifier.fit(train_features, train_labels)
    classes = classifier.classes_

    # Column suffixes: class positions or the class labels themselves.
    if suffix == 'index':
        suffixes = list(range(len(classes)))
    else:
        suffixes = classes

    # Empty test table: emit only the output header (the fit above is still
    # needed to know the classes and therefore the probability columns).
    if test_table.shape[0] == 0:
        header = test_table.columns.tolist() + [pred_col_name]
        header += [prob_col_prefix + '_{}'.format(s) for s in suffixes]
        return {'out_table': pd.DataFrame(columns=header)}

    predicted = pd.DataFrame(classifier.predict(test_features),
                             columns=[pred_col_name])

    # Probability estimates for the test rows, one column per class.
    probabilities = classifier.predict_proba(test_features)
    prob_names = []
    for s in suffixes:
        prob_names.append('{prob_col_prefix}_{suffix}'.format(
            prob_col_prefix=prob_col_prefix, suffix=s))
    prob_frame = pd.DataFrame(data=probabilities, columns=prob_names)

    combined = pd.concat(
        [test_table.reset_index(drop=True), predicted, prob_frame], axis=1)
    return {'out_table': combined}
コード例 #3
0
def _ada_boost_regression_predict(table, model, pred_col_name='prediction'):
    """Append AdaBoost regression predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Dict holding the fitted estimator under ``'regressor'`` and
            the training feature columns under ``['params']['feature_cols']``.
        pred_col_name: Name of the appended prediction column.

    Returns:
        ``{'out_table': DataFrame}``. For an empty input the result is an
        empty frame whose columns are the input columns plus
        ``pred_col_name``.
    """
    # scikit-learn estimators reject zero-row input, so an empty table is
    # answered with just the output header instead of calling predict().
    if table.shape[0] == 0:
        header = table.columns.tolist() + [pred_col_name]
        return {'out_table': pd.DataFrame(columns=header)}

    out_table = table.copy()
    regressor = model['regressor']
    _, test_data = check_col_type(table, model['params']['feature_cols'])

    out_table[pred_col_name] = regressor.predict(test_data)
    return {'out_table': out_table}
コード例 #4
0
def _logistic_regression_predict(table,
                                 model,
                                 prediction_col='prediction',
                                 prob_prefix='probability',
                                 output_log_prob=False,
                                 log_prob_prefix='log_probability',
                                 thresholds=None,
                                 suffix='index'):
    """Score ``table`` with a fitted logistic-regression model.

    Args:
        table: DataFrame to score.
        model: Dict holding the feature columns under ``'features'`` and the
            fitted estimator under ``'lr_model'``.
        prediction_col: Name of the appended prediction column.
        prob_prefix: Prefix of the per-class probability columns.
        output_log_prob: When truthy, log-probability columns are appended
            as well.
        log_prob_prefix: Prefix of the per-class log-probability columns.
        thresholds: Per-class thresholds. ``None`` means equal thresholds.
            A one-element list with a value strictly between 0 and 1 is
            expanded to ``[t, 1 - t]`` for binary models.
        suffix: ``'index'`` names columns by class position; any other value
            names them by the class label.

    Returns:
        ``{'out_table': DataFrame}`` - a copy of ``table`` (original index
        preserved) with the prediction and probability columns appended.

    Raises:
        Whatever ``raise_error('0613', ...)`` raises when the number of
        thresholds differs from the number of classes.
    """
    feature_cols = model['features']
    _, features = check_col_type(table, feature_cols)
    lr_model = model['lr_model']
    classes = lr_model.classes_
    len_classes = len(classes)
    is_binary = len_classes == 2

    if thresholds is None:
        thresholds = np.array([1 / len_classes for _ in classes])
    elif isinstance(thresholds, list):
        if len(thresholds) == 1 and is_binary and 0 < thresholds[0] < 1:
            thresholds = np.array([thresholds[0], 1 - thresholds[0]])
        else:
            thresholds = np.array(thresholds)

    len_thresholds = len(thresholds)
    if len_classes > 0 and len_thresholds > 0 and len_classes != len_thresholds:
        # FN-0613='%s' must have length equal to the number of classes.
        raise_error('0613', ['thresholds'])

    prob = lr_model.predict_proba(features)
    # Predicted class = the one with the largest probability/threshold
    # ratio, computed for all rows at once instead of a row-wise apply().
    prediction = np.asarray(classes)[np.argmax(prob / thresholds, axis=1)]

    out_table = table.copy()
    # Assigned as a plain array so values are placed positionally; a Series
    # with a fresh RangeIndex would be aligned on labels and yield NaN for
    # tables whose index is not 0..n-1.
    out_table[prediction_col] = prediction

    if suffix == 'index':
        suffixes = list(range(len_classes))
    else:
        suffixes = classes

    prob_cols = [
        '{probability_col}_{suffix}'.format(probability_col=prob_prefix,
                                            suffix=s)
        for s in suffixes
    ]
    # Share the table's index so the column-wise concat lines rows up
    # one-to-one instead of taking a label union.
    frames = [out_table,
              pd.DataFrame(data=prob, columns=prob_cols, index=table.index)]

    if output_log_prob:
        log_prob = lr_model.predict_log_proba(features)
        logprob_cols = [
            '{log_probability_col}_{suffix}'.format(
                log_probability_col=log_prob_prefix, suffix=s)
            for s in suffixes
        ]
        frames.append(
            pd.DataFrame(data=log_prob, columns=logprob_cols,
                         index=table.index))

    out_table = pd.concat(frames, axis=1)
    return {'out_table': out_table}
コード例 #5
0
def _ada_boost_classification_predict(table, model, pred_col_name='prediction', prob_col_prefix='probability', suffix='index'):
    """Append AdaBoost class predictions and probabilities to ``table``.

    Args:
        table: DataFrame to score.
        model: Dict holding the fitted estimator under ``'classifier'`` and
            the training feature columns under ``['params']['feature_cols']``.
        pred_col_name: Name of the appended prediction column.
        prob_col_prefix: Prefix of the per-class probability columns.
        suffix: ``'index'`` names probability columns by class position;
            any other value names them by the class label.

    Returns:
        ``{'out_table': DataFrame}``. For an empty input only the output
        header is produced; otherwise a copy of ``table`` (original index
        preserved) with the new columns appended.
    """
    classifier = model['classifier']
    classes = classifier.classes_
    if suffix == 'index':
        suffixes = list(range(len(classes)))
    else:
        suffixes = classes

    # Empty input: return just the header, without calling the estimator.
    if table.shape[0] == 0:
        new_cols = table.columns.tolist() + [pred_col_name]
        new_cols += [prob_col_prefix + '_{}'.format(s) for s in suffixes]
        return {'out_table': pd.DataFrame(columns=new_cols)}

    out_table = table.copy()
    _, test_data = check_col_type(table, model['params']['feature_cols'])

    out_table[pred_col_name] = classifier.predict(test_data)

    prob = classifier.predict_proba(test_data)
    prob_col_name = ['{prob_col_prefix}_{suffix}'.format(prob_col_prefix=prob_col_prefix, suffix=s) for s in suffixes]
    # Reuse the table's index: concat(axis=1) aligns on labels, so a fresh
    # RangeIndex would scatter probabilities onto the wrong (or extra, NaN)
    # rows whenever the input index is not 0..n-1.
    out_col_prob = pd.DataFrame(data=prob, columns=prob_col_name, index=table.index)

    out_table = pd.concat([out_table, out_col_prob], axis=1)
    return {'out_table': out_table}
コード例 #6
0
def _mean_shift_samples_plot(table, input_cols, n_samples, cluster_centers,
                             colors):
    """Draw sampled rows and cluster centres as line plots over the features.

    Args:
        table: DataFrame containing ``input_cols``.
        input_cols: Columns to plot.
        n_samples: Number of rows to sample; ``None`` plots every row.
        cluster_centers: Iterable of centre coordinates, one per cluster.
        colors: Indexable of colours, one per cluster centre.

    Returns:
        Whatever ``plt2MD(plt)`` produces for the finished figure.
    """
    if n_samples is None:
        drawn = table[input_cols]
    else:
        drawn = table[input_cols].sample(n=n_samples)
    feature_names, sample = check_col_type(drawn, input_cols)

    positions = range(len(feature_names))

    # Rotate tick labels more aggressively the longer they are in total.
    total_label_chars = np.sum([len(name) for name in feature_names])
    if total_label_chars >= 512:
        tick_options = {'rotation': 'vertical'}
    elif total_label_chars >= 64:
        tick_options = {'rotation': 45, 'ha': 'right'}
    else:
        tick_options = {}
    plt.xticks(positions, feature_names, **tick_options)

    if feature_names == input_cols:
        # Here `sample` is used through its DataFrame interface: each row is
        # fetched by label from the transposed frame.
        by_row_label = sample.transpose()
        for row_label in sample.index:
            plt.plot(positions, by_row_label[row_label],
                     color='grey', linewidth=1)
    else:
        # Otherwise rows are fetched by position.
        for row_pos in range(len(sample)):
            plt.plot(positions, sample[row_pos], color='grey', linewidth=1)

    for center_idx, center in enumerate(cluster_centers):
        plt.plot(positions, center, "o-", linewidth=4,
                 color=colors[center_idx])

    plt.tight_layout()
    rendered = plt2MD(plt)
    plt.clf()
    return rendered
コード例 #7
0
def _ada_boost_classification_predict(table,
                                      model,
                                      pred_col_name='prediction',
                                      prob_col_prefix='probability',
                                      suffix='index'):
    """Append AdaBoost class predictions and probabilities to ``table``.

    Args:
        table: DataFrame to score.
        model: Dict holding the fitted estimator under ``'classifier'`` and
            the training feature columns under ``['params']['feature_cols']``.
        pred_col_name: Name of the appended prediction column.
        prob_col_prefix: Prefix of the per-class probability columns.
        suffix: ``'index'`` names probability columns by class position;
            any other value names them by the class label.

    Returns:
        ``{'out_table': DataFrame}``. For an empty input only the output
        header is produced; otherwise a copy of ``table`` (original index
        preserved) with the new columns appended.
    """
    classifier = model['classifier']
    classes = classifier.classes_
    if suffix == 'index':
        suffixes = list(range(len(classes)))
    else:
        suffixes = classes

    prob_col_name = [
        '{prob_col_prefix}_{suffix}'.format(prob_col_prefix=prob_col_prefix,
                                            suffix=s)
        for s in suffixes
    ]

    # scikit-learn estimators reject zero-row input, so an empty table is
    # answered with just the output header instead of calling predict().
    if table.shape[0] == 0:
        header = table.columns.tolist() + [pred_col_name] + prob_col_name
        return {'out_table': pd.DataFrame(columns=header)}

    out_table = table.copy()
    _, test_data = check_col_type(table, model['params']['feature_cols'])

    out_table[pred_col_name] = classifier.predict(test_data)

    prob = classifier.predict_proba(test_data)
    # Reuse the table's index: concat(axis=1) aligns on labels, so a fresh
    # RangeIndex would misplace probabilities whenever the input index is
    # not 0..n-1.
    out_col_prob = pd.DataFrame(data=prob,
                                columns=prob_col_name,
                                index=table.index)

    out_table = pd.concat([out_table, out_col_prob], axis=1)
    return {'out_table': out_table}
コード例 #8
0
def _ada_boost_regression_train(table,
                                feature_cols,
                                label_col,
                                max_depth=3,
                                n_estimators=50,
                                learning_rate=1.0,
                                loss='linear',
                                random_state=None):
    """Train an AdaBoost regressor on decision-tree base learners.

    Args:
        table: Training DataFrame.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_col: Name of the target column.
        max_depth: Maximum depth of each base decision tree.
        n_estimators: Number of boosting stages.
        learning_rate: Boosting learning rate.
        loss: Loss used when updating weights.
        random_state: Seed forwarded to the AdaBoost estimator.

    Returns:
        ``{'model': dict}`` - the model dict carries the fitted estimator
        (``'regressor'``), the estimator parameters (``'parameters'``), the
        training arguments (``'params'``), a markdown report
        (``'_repr_brtc_'``) and ``'feature_importance_table'``.
    """
    feature_names, x_train = check_col_type(table, feature_cols)
    y_train = table[label_col]

    base_estimator = DecisionTreeRegressor(max_depth=max_depth)
    # Only the base estimator is passed positionally; recent scikit-learn
    # releases accept every other constructor argument by keyword only.
    regressor = AdaBoostRegressor(base_estimator,
                                  n_estimators=n_estimators,
                                  learning_rate=learning_rate,
                                  loss=loss,
                                  random_state=random_state)

    regressor.fit(x_train, y_train)
    feature_importance = regressor.feature_importances_

    params = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'feature_importance': feature_importance,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'loss': loss,
        'random_state': random_state
    }

    model = _model_dict('ada_boost_regression_model')
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor
    model['params'] = params

    fig_feature_importance = _plot_feature_importance(feature_names, regressor)
    # Markdown rendering of the estimator parameters; kept in its own name
    # so it cannot be confused with the `params` dict stored on the model.
    list_parameters = dict2MD(get_param)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## AdaBoost Regression Train Result
    |
    | ### Feature Importance
    | {fig_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_feature_importance=fig_feature_importance,
               list_parameters=list_parameters)))

    model['_repr_brtc_'] = rb.get()
    feature_importance_table = pd.DataFrame(
        [[name, importance]
         for name, importance in zip(feature_names, feature_importance)],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model': model}
コード例 #9
0
def _logistic_regression_train(table, feature_cols, label_col, penalty='l2', dual=False, tol=0.0001, C=1.0,
                               fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None,
                               solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False,
                               n_jobs=1):
    """Train a logistic-regression classifier and build a coefficient report.

    Args:
        table: Training DataFrame.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_col: Name of the label column.
        penalty, dual, tol, C, fit_intercept, intercept_scaling,
        class_weight, random_state, solver, max_iter, multi_class, verbose,
        warm_start, n_jobs: Forwarded unchanged to ``LogisticRegression``.

    Returns:
        ``{'model': dict}`` - the model dict carries the fitted estimator
        (``'lr_model'``), its intercept, coefficients and classes, the
        chosen penalty/solver, and a markdown summary (``'_repr_brtc_'``).

    Raises:
        Whatever ``raise_error('0718', ...)`` raises when the label column
        is detected as continuous.
    """
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]

    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    # Keyword arguments: recent scikit-learn releases reject positional
    # hyper-parameters, and keywords cannot silently shift if the
    # constructor signature is reordered.
    lr_model = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C, fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling, class_weight=class_weight,
                                  random_state=random_state, solver=solver, max_iter=max_iter,
                                  multi_class=multi_class, verbose=verbose, warm_start=warm_start, n_jobs=n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2

    # One summary row per feature, preceded by the intercept row when an
    # intercept was fitted.
    if fit_intercept == True:
        row_names = ['intercept'] + feature_names
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)), axis=0)
    else:
        row_names = feature_names
        coef_trans = np.transpose(coefficients)

    # A binary model has a single coefficient vector, so a single column.
    # NOTE(review): scikit-learn documents binary coef_ as belonging to
    # classes_[1]; the column is labelled classes[0] here as before -
    # confirm which label the report is meant to show.
    value_columns = [classes[0]] if is_binary else classes
    summary = pd.concat((pd.DataFrame({'features': row_names}),
                         pd.DataFrame(coef_trans, columns=value_columns)), axis=1)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    """.format(table1=pandasDF2MD(summary)
               )))

    model = _model_dict('logistic_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()

    return {'model' : model}
コード例 #10
0
def _pls_regression_predict(table, model, prediction_col='prediction'):
    """Append PLS regression predictions to a copy of ``table``.

    One column is added per predicted output, named
    ``<prediction_col>_<i>`` where ``i`` is the output's position.

    Args:
        table: DataFrame to score.
        model: Dict holding ``'feature_cols'`` and the fitted ``'pls_model'``.
        prediction_col: Prefix of the appended prediction columns.

    Returns:
        ``{'out_table': DataFrame}``.
    """
    scored = table.copy()
    _, features = check_col_type(scored, model['feature_cols'])
    predicted = model['pls_model'].predict(features)

    n_outputs = predicted.shape[-1]
    for output_idx in range(n_outputs):
        column_name = prediction_col + "_{}".format(output_idx)
        scored[column_name] = predicted[:, output_idx]
    return {'out_table': scored}
コード例 #11
0
def _mean_shift_predict(table, model, prediction_col='prediction'):
    """Append mean-shift cluster assignments to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Dict holding the fitted estimator under ``'model'`` and the
            input column names under ``'input_cols'``.
        prediction_col: Name of the appended cluster-label column.

    Returns:
        ``{'out_table': DataFrame}``.
    """
    estimator = model['model']
    _, input_matrix = check_col_type(table, model['input_cols'])
    cluster_labels = estimator.predict(input_matrix)

    scored = table.copy()
    scored[prediction_col] = cluster_labels
    return {'out_table': scored}
コード例 #12
0
def _random_forest_regression_train(table, feature_cols, label_col,
                                 n_estimators=10, criterion="mse", max_depth=None, min_samples_split=2, min_samples_leaf=1,
                                 min_weight_fraction_leaf=0, max_features="None",
                                 max_leaf_nodes=None, min_impurity_decrease=0, random_state=None):
    """Train a random-forest regressor and build its feature-importance report.

    Args:
        table: Training DataFrame.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_col: Name of the target column.
        n_estimators, criterion, max_depth, min_samples_split,
        min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes,
        min_impurity_decrease, random_state: Forwarded to
            ``RandomForestRegressor``.
        max_features: Forwarded as well; the string ``"None"`` is converted
            to the ``None`` object first.

    Returns:
        ``{'model': dict}`` - the model dict carries the fitted estimator
        (``'regressor'``), the training arguments (``'params'``), a markdown
        report (``'_repr_brtc_'``) and ``'feature_importance_table'``.
    """
    feature_names, X_train = check_col_type(table, feature_cols)
    y_train = table[label_col]

    # The default arrives as the literal string "None".
    if max_features == "None":
        max_features = None

    # The same hyper-parameters feed the estimator and the stored params.
    hyper_params = {
        'n_estimators': n_estimators,
        'criterion': criterion,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'min_weight_fraction_leaf': min_weight_fraction_leaf,
        'max_features': max_features,
        'max_leaf_nodes': max_leaf_nodes,
        'min_impurity_decrease': min_impurity_decrease,
        'random_state': random_state,
    }
    regressor = RandomForestRegressor(**hyper_params)
    regressor.fit(X_train, y_train)

    params = {'feature_cols': feature_cols, 'label_col': label_col}
    params.update(hyper_params)

    model = _model_dict('random_forest_regression_model')
    model['regressor'] = regressor
    model['params'] = params

    fig_feature_importances = _plot_feature_importances(feature_names, regressor)

    report = BrtcReprBuilder()
    report.addMD(strip_margin("""
    | ## Random Forest Regression Train Result
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    """.format(fig_feature_importances=fig_feature_importances)))
    model['_repr_brtc_'] = report.get()

    # One (feature_name, importance) row per feature.
    importances = regressor.feature_importances_
    importance_rows = [[name, score] for name, score in zip(feature_names, importances)]
    model['feature_importance_table'] = pd.DataFrame(
        importance_rows, columns=['feature_name', 'importance'])
    return {'model' : model}
コード例 #13
0
def _decision_tree_regression_predict(table, model, prediction_col='prediction',
                                     check_input=True):
    """Append decision-tree regression predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Dict holding ``'feature_cols'`` and the fitted ``'regressor'``.
        prediction_col: Name of the appended prediction column.
        check_input: Passed through as the second positional argument of
            the estimator's ``predict``.

    Returns:
        ``{'out_table': DataFrame}``.
    """
    scored = table.copy()
    _, features = check_col_type(table, model['feature_cols'])
    tree = model['regressor']
    scored[prediction_col] = tree.predict(features, check_input)
    return {'out_table': scored}
コード例 #14
0
def _decision_tree_classification_predict(table,
                                          model,
                                          prediction_col='prediction',
                                          check_input=True):
    """Append decision-tree class predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Dict holding ``'feature_cols'`` and the fitted ``'classifier'``.
        prediction_col: Name of the appended prediction column.
        check_input: Passed through as the second positional argument of
            the estimator's ``predict``.

    Returns:
        ``{'out_table': DataFrame}``.
    """
    scored = table.copy()
    columns_used = model['feature_cols']
    _, features = check_col_type(table, columns_used)
    tree = model['classifier']
    predicted = tree.predict(features, check_input)
    scored[prediction_col] = predicted
    return {'out_table': scored}
コード例 #15
0
def _pls_regression_train(table, feature_cols, label_cols, n_components=2, scale=True, max_iter=500, tol=1e-6):
    """Train a PLS regression model and report its in-sample fit.

    Args:
        table: Training DataFrame.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_cols: Target column names handed to ``check_col_type``.
        n_components: Number of PLS components.
        scale: Forwarded to the estimator's ``scale`` option.
        max_iter: Maximum number of iterations of the estimator.
        tol: Convergence tolerance of the estimator.

    Returns:
        ``{'model': dict}`` - the model dict carries the fitted estimator
        (``'pls_model'``), the feature/label column names, the MAE, MSE and
        R2 score measured on the training data, ``max_iter``, ``tol`` and a
        markdown report (``'_repr_brtc_'``).
    """
    pls_model = PLS(n_components=n_components, scale=scale, max_iter=max_iter, tol=tol)
    _, features = check_col_type(table, feature_cols)
    _, labels = check_col_type(table, label_cols)
    pls_model.fit(features, labels)

    # Metrics are computed on the training data itself.
    predict = pls_model.predict(features)
    _mean_absolute_error = mean_absolute_error(labels, predict)
    _mean_squared_error = mean_squared_error(labels, predict)
    _r2_score = r2_score(labels, predict)

    # DataFrame.from_items was removed in pandas 1.0; a dict plus an
    # explicit `columns` list gives the same frame on every pandas version.
    result_table = pd.DataFrame(
        {
            'Metric': ['Mean Absolute Error', 'Mean Squared Error', 'R2 Score'],
            'Score': [_mean_absolute_error, _mean_squared_error, _r2_score],
        },
        columns=['Metric', 'Score'])

    # (estimator parameter name, human-readable label) in display order.
    param_labels = [
        ('n_components', 'Number of components'),
        ('scale', "Scale"),
        ('max_iter', 'Max iteration'),
        ('tol', 'Tolerance'),
    ]
    get_param = pls_model.get_params()
    param_table = pd.DataFrame(
        {
            'Parameter': [label for _, label in param_labels],
            'Value': [get_param[key] for key, _ in param_labels],
        },
        columns=['Parameter', 'Value'])

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ### PLS Regression Result
    | {result}
    | ### Parameters
    | {list_parameters}
    """.format(result=pandasDF2MD(result_table), list_parameters=pandasDF2MD(param_table)
               )))
    model = _model_dict('pls_regression_model')
    model['feature_cols'] = feature_cols
    model['label'] = label_cols
    model['mean_absolute_error'] = _mean_absolute_error
    model['mean_squared_error'] = _mean_squared_error
    model['r2_score'] = _r2_score
    model['max_iter'] = max_iter
    model['tol'] = tol
    model['pls_model'] = pls_model
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
コード例 #16
0
def _ada_boost_regression_predict(table, model, pred_col_name='prediction'):
    """Append AdaBoost regression predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Dict holding the fitted estimator under ``'regressor'`` and
            the training feature columns under ``['params']['feature_cols']``.
        pred_col_name: Name of the appended prediction column.

    Returns:
        ``{'out_table': DataFrame}``. For an empty input the result is an
        empty frame whose columns are the input columns plus
        ``pred_col_name``; the model is not consulted in that case.
    """
    row_count = table.shape[0]
    if row_count == 0:
        # Nothing to score: only the output header is produced.
        header = table.columns.tolist()
        header.append(pred_col_name)
        return {'out_table': pd.DataFrame(columns=header)}

    scored = table.copy()
    estimator = model['regressor']
    training_cols = model['params']['feature_cols']
    _, feature_matrix = check_col_type(table, training_cols)

    scored[pred_col_name] = estimator.predict(feature_matrix)
    return {'out_table': scored}
コード例 #17
0
ファイル: kmeans.py プロジェクト: yemode2k/studio
def _kmeans_predict(table, model, prediction_col='prediction'):
    """Append k-means cluster assignments to a copy of ``table``.

    Two model flavours are accepted, both with ``_context == 'python'``:
    ``_type == 'kmeans'`` keeps its estimator under ``'model'`` and
    ``_type == 'kmeans_silhouette'`` keeps it under ``'best_model'``.
    Any other model is reported through ``raise_runtime_error``.

    Args:
        table: DataFrame to score.
        model: Model dict as described above, also holding ``'input_cols'``.
        prediction_col: Name of the appended cluster-label column.

    Returns:
        ``{'out_table': DataFrame}``.
    """
    # Pick the dict key under which this model type stores its estimator.
    if model['_context'] == 'python' and model['_type'] == 'kmeans':
        estimator_key = 'model'
    elif model['_context'] == 'python' and model['_type'] == 'kmeans_silhouette':
        estimator_key = 'best_model'
    else:
        raise_runtime_error("Unsupported model")

    estimator = model[estimator_key]
    _, features = check_col_type(table, model['input_cols'])
    cluster_labels = estimator.predict(features)

    out_table = table.copy()
    out_table[prediction_col] = cluster_labels
    return {'out_table':out_table}
コード例 #18
0
def _penalized_linear_regression_predict(table,
                                         model,
                                         prediction_col='prediction'):
    """Append penalized linear-regression predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Dict holding ``'feature_cols'`` and the fitted
            ``'regression_model'``.
        prediction_col: Name of the appended prediction column.

    Returns:
        ``{'out_table': DataFrame}``.
    """
    scored = table.copy()
    _, features = check_col_type(scored, model['feature_cols'])
    estimator = model['regression_model']
    scored[prediction_col] = estimator.predict(features)
    return {'out_table': scored}
コード例 #19
0
ファイル: gaussian_mixture.py プロジェクト: parkjh80/studio
def _gaussian_mixture_predict(table,
                              model,
                              display_probability,
                              prediction_col_name='prediction'):
    """Append Gaussian-mixture component assignments to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Dict holding the fitted estimator under ``'gmm'``, the input
            column names under ``'input_cols'`` and the component count
            under ``'number_of_components'``.
        display_probability: When equal to ``True``, one ``probability_<i>``
            column is appended per component.
        prediction_col_name: Name of the appended prediction column.

    Returns:
        ``{'out_table': DataFrame}``.
    """
    out_table = table.copy()
    _, inputarr = check_col_type(table, model['input_cols'])
    gmm = model['gmm']
    out_table[prediction_col_name] = gmm.predict(inputarr)

    if display_probability == True:
        # Compute the membership probabilities once, on the same checked
        # input used for predict(), instead of once per component.
        prob = gmm.predict_proba(inputarr)
        for i in range(0, model['number_of_components']):
            # A plain array column is placed positionally, so tables with a
            # non-default index no longer get NaN from label alignment.
            out_table['probability_' + str(i)] = prob[:, i]
    return {'out_table': out_table}
コード例 #20
0
ファイル: mlp_regression.py プロジェクト: yemode2k/studio
def _mlp_regression_predict(table, model, prediction_col='prediction'):
    """Append MLP regression predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Dict holding ``'features'`` and the fitted ``'mlp_model'``.
        prediction_col: Name of the appended prediction column.

    Returns:
        ``{'out_table': DataFrame}``.
    """
    scored = table.copy()
    _, features = check_col_type(scored, model['features'])
    network = model['mlp_model']
    scored[prediction_col] = network.predict(features)
    return {'out_table': scored}
コード例 #21
0
def _svm_classification_train(table,
                              feature_cols,
                              label_col,
                              c=1.0,
                              kernel='rbf',
                              degree=3,
                              gamma='auto',
                              coef0=0.0,
                              shrinking=True,
                              probability=True,
                              tol=1e-3,
                              max_iter=-1,
                              random_state=None):
    """Train a support-vector classifier and build its parameter report.

    Args:
        table: Training DataFrame.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_col: Name of the label column.
        c: Forwarded to ``svm.SVC`` as ``C``.
        kernel, degree, gamma, coef0, shrinking, probability, tol,
        max_iter, random_state: Forwarded unchanged to ``svm.SVC``.

    Returns:
        ``{'model': dict}`` - the model dict carries the fitted estimator
        (``'svc_model'``), the feature columns (``'features'``) and a
        markdown report (``'_repr_brtc_'``).
    """
    feature_names, features = check_col_type(table, feature_cols)
    # Labels are read from a copy of the input table.
    labels = table.copy()[label_col]

    target_kind = sklearn_utils.multiclass.type_of_target(labels)
    if target_kind == 'continuous':
        raise_runtime_error('Label Column should not be continuous.')

    classifier = svm.SVC(C=c,
                         kernel=kernel,
                         degree=degree,
                         gamma=gamma,
                         coef0=coef0,
                         shrinking=shrinking,
                         probability=probability,
                         tol=tol,
                         max_iter=max_iter,
                         random_state=random_state)
    fitted = classifier.fit(features, labels)

    # Report = estimator parameters plus the columns used for training.
    report_params = classifier.get_params()
    report_params['feature_cols'] = feature_names
    report_params['label_col'] = label_col

    report = BrtcReprBuilder()
    report.addMD(
        strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter} 
    """.format(table_parameter=dict2MD(report_params))))

    trained = _model_dict('svc_model')
    trained['svc_model'] = fitted
    trained['features'] = feature_cols
    trained['_repr_brtc_'] = report.get()

    return {'model': trained}
コード例 #22
0
ファイル: kmeans.py プロジェクト: yemode2k/studio
def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction', init='k-means++', n_init=10,
             max_iter=300, tol=1e-4, precompute_distances='auto', seed=None,
             n_jobs=1, algorithm='auto', n_samples=None):
    """Fit k-means on ``table``, label its rows and build a markdown report.

    Args:
        table: DataFrame to cluster.
        input_cols: Input column names handed to ``check_col_type``.
        n_clusters: Number of clusters requested.
        prediction_col: Name of the appended cluster-label column.
        init, n_init, max_iter, tol, precompute_distances, n_jobs,
        algorithm: Forwarded unchanged to the k-means estimator.
        seed: Forwarded as ``random_state`` and to the sample plot.
        n_samples: Number of rows drawn in the sample plot; ``None`` means
            every row.

    Returns:
        ``{'out_table': DataFrame, 'model': dict}`` - the labelled copy of
        ``table`` and a model dict with the fitted estimator (``'model'``),
        ``'input_cols'`` and the report (``'_repr_brtc_'``).
    """
    feature_names, inputarr = check_col_type(table, input_cols)
    if n_samples is None:
        n_samples = len(inputarr)

    estimator = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init,
                         max_iter=max_iter, tol=tol,
                         precompute_distances=precompute_distances,
                         verbose=0, random_state=seed, copy_x=True,
                         n_jobs=n_jobs, algorithm=algorithm)
    estimator.fit(inputarr)

    # Training arguments as shown in the report (the requested n_clusters,
    # captured before the name is rebound to the fitted centre count below).
    params = {
        'input_cols': feature_names,
        'n_clusters': n_clusters,
        'init': init,
        'n_init': n_init,
        'max_iter': max_iter,
        'tol': tol,
        'precompute_distances': precompute_distances,
        'seed': seed,
        'n_jobs': n_jobs,
        'algorithm': algorithm,
        'n_samples': n_samples,
    }

    cluster_centers = estimator.cluster_centers_
    n_clusters = len(cluster_centers)
    # One colour per fitted centre, spread evenly over the colormap.
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    labels = estimator.labels_

    # Projection (at most 2 components) used by the PCA scatter plot.
    n_projection_dims = min(2, len(feature_names))
    pca2_model = PCA(n_components=n_projection_dims).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    fig_centers = _kmeans_centers_plot(feature_names, cluster_centers, colors)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers, seed, colors)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2, colors)

    report = BrtcReprBuilder()
    report.addMD(strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Sum of square error: {sse_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers} 
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=estimator.n_iter_, sse_=estimator.inertia_, fig_cluster_centers=fig_centers, fig_pca=fig_pca, fig_samples=fig_samples, params=dict2MD(params))))

    model = _model_dict('kmeans')
    model['model'] = estimator
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = report.get()

    labelled = table.copy()
    labelled[prediction_col] = labels
    return {'out_table':labelled, 'model':model}
コード例 #23
0
ファイル: xgb_regression.py プロジェクト: yemode2k/studio
def _xgb_regression_predict(table, model, prediction_col='prediction',
            output_margin=False, ntree_limit=None):
    """Append XGBoost regression predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Dict holding ``'feature_cols'`` and the fitted ``'regressor'``.
        prediction_col: Name of the appended prediction column.
        output_margin: Passed through as the second positional argument of
            the estimator's ``predict``.
        ntree_limit: Passed through as the third positional argument of
            the estimator's ``predict``.

    Returns:
        ``{'out_table': DataFrame}``.
    """
    _, features = check_col_type(table, model['feature_cols'])
    booster = model['regressor']
    predicted = booster.predict(features, output_margin, ntree_limit)

    scored = table.copy()
    scored[prediction_col] = predicted
    return {'out_table': scored}
コード例 #24
0
def _glm_predict(table, model, prediction_col='prediction'):
    """Append GLM predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Dict holding ``'features'``, ``'fit_intercept'`` and the
            fitted ``'glm_model'``.
        prediction_col: Name of the appended prediction column.

    Returns:
        ``{'out_table': DataFrame}``.
    """
    feature_cols = model['features']
    _, features = check_col_type(table, feature_cols)

    fit_intercept = model['fit_intercept']
    glm_model = model['glm_model']

    if fit_intercept == True:
        # has_constant='add' always prepends the intercept column. The
        # default ('skip') leaves it out whenever the data already contains
        # a constant column - which is every column for single-row input -
        # so the design matrix would no longer match the fitted model.
        features = sm.add_constant(features, has_constant='add')
    prediction = glm_model.predict(features)

    result = table.copy()
    result[prediction_col] = prediction

    return {'out_table': result}
コード例 #25
0
def _xgb_classification_predict(table,
                                model,
                                prediction_col='prediction',
                                probability_col='probability',
                                thresholds=None,
                                suffix='index',
                                output_margin=False,
                                ntree_limit=None):
    """Score ``table`` with a fitted XGBoost classifier.

    Args:
        table: DataFrame to score.
        model: Dict holding ``'feature_cols'`` and the fitted ``'classifier'``.
        prediction_col: Name of the appended prediction column.
        probability_col: Prefix of the per-class probability columns.
        thresholds: Per-class thresholds. ``None`` means equal thresholds.
            A one-element list with a value strictly between 0 and 1 is
            expanded to ``[t, 1 - t]`` for binary models.
        suffix: ``'index'`` names columns by class position; any other value
            names them by the class label.
        output_margin: Accepted for interface compatibility; not used by
            this function.
        ntree_limit: Passed as the second positional argument of the
            estimator's ``predict_proba``.

    Returns:
        ``{'out_table': DataFrame}`` - a copy of ``table`` (original index
        preserved) with the prediction and probability columns appended.
    """
    feature_cols = model['feature_cols']
    classifier = model['classifier']

    _, features = check_col_type(table, feature_cols)
    classes = classifier.classes_
    len_classes = len(classes)
    is_binary = len_classes == 2

    if thresholds is None:
        thresholds = np.array([1 / len_classes for _ in classes])
    elif isinstance(thresholds, list):
        if len(thresholds) == 1 and is_binary and 0 < thresholds[0] < 1:
            thresholds = np.array([thresholds[0], 1 - thresholds[0]])
        else:
            thresholds = np.array(thresholds)

    prob = classifier.predict_proba(features, ntree_limit)
    # Predicted class = largest probability/threshold ratio per row.
    prediction = classes[np.argmax(prob / thresholds, axis=1)]

    if suffix == 'index':
        suffixes = list(range(len_classes))
    else:
        suffixes = classes

    prob_cols = [
        '{probability_col}_{suffix}'.format(probability_col=probability_col,
                                            suffix=s)
        for s in suffixes
    ]
    # Share the table's index: concat(axis=1) aligns on labels, so a fresh
    # RangeIndex would misplace probabilities (and add NaN rows) whenever
    # the input index is not 0..n-1.
    prob_df = pd.DataFrame(data=prob, columns=prob_cols, index=table.index)

    result = table.copy()
    result[prediction_col] = prediction
    result = pd.concat([result, prob_df], axis=1)

    return {'out_table': result}
コード例 #26
0
def _naive_bayes_predict(table, model, suffix, display_log_prob=False, prediction_col='prediction', prob_prefix='probability', log_prob_prefix='log_probability'):
    """Score ``table`` with a naive-Bayes model.

    Args:
        table: DataFrame to score; it is not modified.
        model: Model dict. Feature columns are read from ``'features'`` or,
            failing that, ``'feature_cols'``. The estimator is taken from
            ``'nb_model'`` when present; otherwise it is rebuilt from the
            parameters stored in ``model['table_1']``. An optional
            ``'label_encoder'`` maps predictions back to original labels.
        suffix: ``'label'`` names probability columns by class label (only
            when a label encoder is present); otherwise by class position.
        display_log_prob: When equal to ``True``, log-probability columns
            are appended as well.
        prediction_col: Name of the appended prediction column.
        prob_prefix: Prefix of the per-class probability columns.
        log_prob_prefix: Prefix of the per-class log-probability columns.

    Returns:
        ``{'out_table': DataFrame}`` - a copy of ``table`` (original index
        preserved) with the prediction and probability columns appended.
    """
    if 'features' in model:
        feature_cols = model['features']
    else:
        feature_cols = model['feature_cols']
    _, features = check_col_type(table, feature_cols)

    if 'nb_model' in model:
        nb_model = model['nb_model']
    else:
        # No fitted estimator stored: rebuild a two-class one from the
        # saved parameter table.
        model_table = model['table_1']
        if model_table.model_type[0] == 'multinomial':
            nb_model = MultinomialNB()
        else:
            nb_model = BernoulliNB()
        # Dummy fit so the estimator counts as fitted, then overwrite the
        # learned attributes with the stored values.
        nb_model.fit([[1]], [1])
        nb_model.classes_ = np.array([0, 1])
        nb_model.class_log_prior_ = model_table.pi.values
        nb_model.feature_log_prob_ = np.array(list(model_table.theta))

    prediction = nb_model.predict(features)
    if 'label_encoder' in model:
        label_encoder = model['label_encoder']
        prediction = label_encoder.inverse_transform(prediction)
        if suffix == 'label':
            suffixes = label_encoder.classes_
        else:
            suffixes = range(0, len(label_encoder.classes_))
    else:
        suffixes = [0, 1]

    prob = nb_model.predict_proba(features)
    prob_cols = ['{prefix}_{suffix}'.format(prefix=prob_prefix, suffix=s) for s in suffixes]

    # Work on a copy so the caller's table does not silently gain a
    # prediction column.
    result = table.copy()
    result[prediction_col] = prediction

    # Probability frames share the table's index so concat(axis=1) pairs
    # rows one-to-one even when the index is not 0..n-1.
    frames = [result, pd.DataFrame(data=prob, columns=prob_cols, index=table.index)]

    if display_log_prob == True:
        log_prob = nb_model.predict_log_proba(features)
        logprob_cols = ['{prefix}_{suffix}'.format(prefix=log_prob_prefix, suffix=s) for s in suffixes]
        frames.append(pd.DataFrame(data=log_prob, columns=logprob_cols, index=table.index))

    result = pd.concat(frames, axis=1)

    return {'out_table' : result}
コード例 #27
0
ファイル: linear_regression.py プロジェクト: steelblu/studio
def _linear_regression_predict(table, model, prediction_col='prediction'):
    """Append linear-regression predictions to a copy of ``table``.

    ``model`` supplies the feature columns (``'features'``), the intercept
    flag (``'fit_intercept'``) and the fitted estimator (``'lr_model'``).
    When the intercept flag equals ``True`` a constant column is added to the
    feature matrix through ``sm.add_constant`` before predicting.

    Returns ``{'out_table': DataFrame}`` where the frame is the copied input
    plus the ``prediction_col`` column.
    """
    out_table = table.copy()
    _, design = check_col_type(out_table, model['features'])
    with_intercept = model['fit_intercept']
    fitted = model['lr_model']

    # Only the design matrix differs between the two cases; the predict call
    # itself is the same.
    if with_intercept == True:  # noqa: E712
        design = sm.add_constant(design, has_constant='add')

    out_table[prediction_col] = fitted.predict(design)
    return {'out_table': out_table}
コード例 #28
0
ファイル: kmeans.py プロジェクト: yemode2k/studio
def _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers, seed, colors):
    """Plot sampled rows and the cluster centers over the feature axis.

    ``n_samples`` rows are drawn (seeded by ``seed``) and plotted as thin grey
    lines; each entry of ``cluster_centers`` is drawn on top as a marked line
    in ``colors[idx]``. Tick labels are rotated according to the total length
    of the feature names. The current figure is passed to ``plt2MD`` and then
    cleared; the value ``plt2MD`` returned is the result.
    """
    feature_names, inputarr = check_col_type(table, input_cols)
    label_chars = np.sum([len(name) for name in feature_names])
    sample = pd.DataFrame(inputarr).sample(n=n_samples, random_state=seed)
    positions = range(len(feature_names))

    # The longer the combined labels, the steeper the tick rotation.
    if label_chars >= 512:
        tick_options = {'rotation': 'vertical'}
    elif label_chars >= 64:
        tick_options = {'rotation': 45, 'ha': 'right'}
    else:
        tick_options = {}
    plt.xticks(positions, feature_names, **tick_options)

    # Transpose once so each sampled row can be picked as a column by label.
    rows_as_columns = sample.transpose()
    for row_label in sample.index:
        plt.plot(positions, rows_as_columns[row_label], color='grey', linewidth=1)

    for cluster_no, center in enumerate(cluster_centers):
        plt.plot(positions, center, "o-", label=cluster_no, linewidth=2,
                 color=colors[cluster_no])

    plt.tight_layout()
    figure_md = plt2MD(plt)
    plt.clf()
    return figure_md
コード例 #29
0
def _naive_bayes_predict(table,
                         model,
                         suffix,
                         display_log_prob=False,
                         prediction_col='prediction',
                         prob_prefix='probability',
                         log_prob_prefix='log_probability'):
    """Predict with a trained naive Bayes model and append the results to a copy of ``table``.

    Parameters
    ----------
    table : pandas.DataFrame
        Input rows. It is not modified; the output is built on a copy.
    model : dict
        Must provide ``'features'`` (feature columns), ``'nb_model'`` (fitted
        estimator) and ``'label_encoder'`` (used to map predictions back to
        the original labels).
    suffix : str
        ``'label'`` names probability columns after the encoder's classes;
        any other value uses the positional class index.
    display_log_prob : bool
        When ``True``, log-probability columns are appended as well.
    prediction_col : str
        Name of the prediction column.
    prob_prefix, log_prob_prefix : str
        Prefixes for the probability / log-probability column names.

    Returns
    -------
    dict
        ``{'out_table': DataFrame}`` holding the input columns, the prediction
        column and one probability column per class (plus log-probabilities
        when requested).
    """
    feature_cols = model['features']
    _, features = check_col_type(table, feature_cols)
    nb_model = model['nb_model']
    label_encoder = model['label_encoder']

    encoded_prediction = nb_model.predict(features)
    prediction = label_encoder.inverse_transform(encoded_prediction)

    if suffix == 'label':
        suffixes = label_encoder.classes_
    else:
        suffixes = range(0, len(label_encoder.classes_))

    # Copy so the caller's table does not silently gain a prediction column.
    result = table.copy()
    result[prediction_col] = prediction

    # Build the extra frames on the table's own index; with a fresh RangeIndex
    # the column-wise concat would misalign rows for a non-default index.
    prob = nb_model.predict_proba(features)
    prob_cols = [
        '{prefix}_{suffix}'.format(prefix=prob_prefix, suffix=class_suffix)
        for class_suffix in suffixes
    ]
    frames = [
        result,
        pd.DataFrame(data=prob, columns=prob_cols, index=result.index),
    ]

    # Explicit comparison kept on purpose: only True (or 1) enables the
    # log-probability columns, exactly as before.
    if display_log_prob == True:  # noqa: E712
        log_prob = nb_model.predict_log_proba(features)
        logprob_cols = [
            '{prefix}_{suffix}'.format(prefix=log_prob_prefix,
                                       suffix=class_suffix)
            for class_suffix in suffixes
        ]
        frames.append(
            pd.DataFrame(data=log_prob,
                         columns=logprob_cols,
                         index=result.index))

    result = pd.concat(frames, axis=1)

    return {'out_table': result}
コード例 #30
0
def _logistic_regression_train(table,
                               feature_cols,
                               label_col,
                               penalty='l2',
                               dual=False,
                               tol=0.0001,
                               C=1.0,
                               fit_intercept=True,
                               intercept_scaling=1,
                               class_weight=None,
                               random_state=None,
                               solver='liblinear',
                               max_iter=100,
                               multi_class='ovr',
                               verbose=0,
                               warm_start=False,
                               n_jobs=1):
    """Fit a ``LogisticRegression`` and build a summary model dictionary.

    Parameters
    ----------
    table : pandas.DataFrame
        Training data.
    feature_cols : list
        Columns used as features (resolved through ``check_col_type``).
    label_col : str
        Column holding the class labels.
    penalty, dual, tol, C, fit_intercept, intercept_scaling, class_weight,
    random_state, solver, max_iter, multi_class, verbose, warm_start, n_jobs
        Forwarded unchanged to the ``LogisticRegression`` constructor.

    Returns
    -------
    dict
        ``{'model': model}``. ``model`` carries the fitted estimator
        (``'lr_model'``), its coefficients / intercept / classes, per-parameter
        standard errors, AIC and BIC, the summary DataFrame and the rendered
        report (``'_repr_brtc_'``). For two-class problems it also carries
        ``'wald_statistics'`` and ``'p_values'``.

    Notes
    -----
    ``raise_error('0718', 'label_col')`` is invoked when the label column is
    detected as a continuous target.
    """
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]

    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    # Passed by keyword: newer scikit-learn releases reject positional
    # constructor arguments, and keywords do not depend on parameter order.
    lr_model = LogisticRegression(penalty=penalty,
                                  dual=dual,
                                  tol=tol,
                                  C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state,
                                  solver=solver,
                                  max_iter=max_iter,
                                  multi_class=multi_class,
                                  verbose=verbose,
                                  warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2
    prob = lr_model.predict_proba(features)

    # Log-likelihood of the observed labels. Summing log-probabilities avoids
    # the underflow to 0 (and thus infinite AIC/BIC) that multiplying the raw
    # probabilities produces on larger tables.
    class_position = {cls: idx for idx, cls in enumerate(classes)}
    label_idx = np.array([class_position[value] for value in label])
    log_likelihood = np.sum(
        np.log(prob[np.arange(len(label_idx)), label_idx]))

    # Parameter count used by AIC/BIC: one per feature column, plus the
    # intercept when it is fitted.
    k = len(feature_cols) + 1 if fit_intercept else len(feature_cols)
    aic = 2 * k - 2 * log_likelihood
    bic = np.log(len(table)) * k - 2 * log_likelihood

    # Design matrix as a plain array so both DataFrame and array-like feature
    # inputs are handled the same way.
    x_design = np.asarray(features)
    if fit_intercept:
        x_design = np.hstack([np.ones((x_design.shape[0], 1)), x_design])

    if is_binary:
        # p * (1 - p) per row, since the two class probabilities sum to 1.
        v = np.prod(prob, axis=1)
        cov_logit = np.linalg.inv(
            np.dot((x_design * v[:, np.newaxis]).T, x_design))
        std_err = np.sqrt(np.diag(cov_logit))
        if fit_intercept:
            logit_params = np.insert(coefficients, 0, intercept)
        else:
            # Flatten to 1-D like the intercept branch; the (1, n) shape would
            # not fit the single-column summary frames below.
            logit_params = np.ravel(coefficients)
        wald = (logit_params / std_err)**2
        p_values = 1 - chi2.cdf(wald, 1)
    else:
        std_err = []
        for class_prob in prob.T:
            v = class_prob * (1 - class_prob)
            cov_logit = np.linalg.inv(
                np.dot((x_design * v[:, np.newaxis]).T, x_design))
            std_err.append(np.sqrt(np.diag(cov_logit)))
        std_err = np.array(std_err)

    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_names})
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)),
                                    axis=0)
    else:
        summary = pd.DataFrame({'features': feature_names})
        coef_trans = np.transpose(coefficients)

    if is_binary:
        summary = pd.concat(
            (summary, pd.DataFrame(coef_trans, columns=[classes[0]]),
             pd.DataFrame(std_err, columns=['standard_error']),
             pd.DataFrame(wald, columns=['wald_statistic']),
             pd.DataFrame(p_values, columns=['p_value'])),
            axis=1)
    else:
        se_columns = ['standard_error_{}'.format(cls) for cls in classes]
        summary = pd.concat(
            (summary, pd.DataFrame(coef_trans, columns=classes),
             pd.DataFrame(std_err.T, columns=se_columns)),
            axis=1)
        # Interleave each class's coefficient column with its standard error.
        arrange_col = ['features']
        for cls in classes:
            arrange_col.append(cls)
            arrange_col.append('standard_error_{}'.format(cls))
        summary = summary[arrange_col]

    rb = BrtcReprBuilder()
    if is_binary:
        rb.addMD(
            strip_margin("""
        | ## Logistic Regression Result
        | ### Summary
        | {table1}
        |
        | ##### Column '{small}' is the coefficients under the assumption ({small} = 0, {big} = 1).
        |
        | #### AIC : {aic}
        |
        | #### BIC : {bic}
        """.format(small=classes[0],
                   big=classes[1],
                   table1=pandasDF2MD(summary, num_rows=100),
                   aic=aic,
                   bic=bic)))
    else:
        rb.addMD(
            strip_margin("""
        | ## Logistic Regression Result
        | ### Summary
        | {table1}
        |
        | ##### Each column whose name is one of classes of Label Column is the coefficients under the assumption it is 1 and others are 0.
        |
        | ##### For example, column '{small}' is the coefficients under the assumption ({small} = 1, others = 0).
        |
        | #### AIC : {aic}
        |
        | #### BIC : {bic}
        """.format(small=classes[0],
                   table1=pandasDF2MD(summary, num_rows=100),
                   aic=aic,
                   bic=bic)))

    model = _model_dict('logistic_regression_model')
    model['standard_errors'] = std_err
    model['aic'] = aic
    model['bic'] = bic
    if is_binary:
        model['wald_statistics'] = wald
        model['p_values'] = p_values
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()
    model['summary'] = summary
    return {'model': model}