def _svc_train(table,
               feature_cols,
               label_col,
               c=1.0,
               kernel='rbf',
               degree=3,
               gamma='auto',
               coef0=0.0,
               shrinking=True,
               probability=True,
               tol=1e-3,
               max_iter=-1,
               random_state=None):
    _table = table.copy()

    _feature_cols = _table[feature_cols]
    _label_col = _table[label_col]

    _svc = svm.SVC(C=c,
                   kernel=kernel,
                   degree=degree,
                   gamma=gamma,
                   coef0=coef0,
                   shrinking=shrinking,
                   probability=probability,
                   tol=tol,
                   max_iter=max_iter,
                   random_state=random_state)
    _svc_model = _svc.fit(_feature_cols, _label_col)

    get_param = _svc.get_params()
    get_param['feature_cols'] = feature_cols
    get_param['label_col'] = label_col

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter} 
    """.format(table_parameter=dict2MD(get_param))))

    _model = _model_dict('svc_model')
    _model['svc_model'] = _svc_model
    _model['features'] = feature_cols
    _model['report'] = rb.get()

    return {'model': _model}
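
For orientation, here is a minimal, self-contained sketch of the scikit-learn call this wrapper builds on; the toy DataFrame and column names are hypothetical, not part of the original module.

import numpy as np
import pandas as pd
from sklearn import svm

# Hypothetical toy data: two numeric features and a string label.
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.normal(size=(40, 2)), columns=['x1', 'x2'])
df['y'] = (df['x1'] + df['x2'] > 0).map({True: 'pos', False: 'neg'})

clf = svm.SVC(C=1.0, kernel='rbf', gamma='auto', probability=True)
clf.fit(df[['x1', 'x2']], df['y'])
print(clf.predict([[0.5, 0.5]]))        # predicted class label
print(clf.predict_proba([[0.5, 0.5]]))  # class probabilities (probability=True)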
Example #2
def _hierarchical_clustering_post(table, model, num_clusters, cluster_col='prediction'):
    Z = model['model']
    input_cols = model['input_cols']
    params = model['parameters']
    out_table = model['outtable']
    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    out_table2 = table.copy()
    out_table2[cluster_col] = predict
    
    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])
    
    out_table3 = pd.DataFrame([])
    out_table3[cluster_col] = M
    out_table3['name_of_clusters'] = which_cluster
    out_table3 = out_table3.sort_values(cluster_col)
    cluster_count = np.bincount(out_table2[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    out_table3['num_of_entities'] = list(cluster_count)
    
    rb = ReportBuilder()
    rb.addMD(strip_margin("""### Hierarchical Clustering Post Process Result"""))
    rb.addMD(strip_margin("""
    |### Parameters
    |
    |{display_params}
    |
    |## Clusters Information
    |
    |{out_table3}
    |
    """.format(display_params=dict2MD(params), out_table3=pandasDF2MD(out_table3))))

    model = _model_dict('hierarchical_clustering_post')
    model['report'] = rb.get()
    
    return {'out_table2' : out_table2, 'model': model}
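
A minimal sketch of the two SciPy calls this post-process depends on: fcluster to cut the linkage matrix into flat clusters, and leaders to find each cluster's root node. The toy points are hypothetical.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, leaders

X = np.array([[0.0], [0.1], [5.0], [5.1], [10.0]])
Z = linkage(X, method='complete', metric='euclidean')
labels = fcluster(Z, t=3, criterion='maxclust')  # cut into at most 3 flat clusters
L, M = leaders(Z, labels)                        # root node id (L) per cluster number (M)
print(labels)  # cluster assignment per point, e.g. [1 1 2 2 3]
print(L, M)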
Example #3
def _outlier_detection_lof(table, input_cols, choice='add_prediction', n_neighbors=20, new_column_name='is_outlier'):
    # Fixed here rather than exposed as parameters: algorithm='auto', leaf_size=30,
    # metric='minkowski', p=2, contamination=0.1
    out_table = table.copy()
    lof_model = LocalOutlierFactor(n_neighbors, algorithm='auto', leaf_size=30, metric='minkowski', p=2, contamination=0.1)
    # fit_predict returns 1 for inliers and -1 for outliers; call it once and reuse.
    predictions = lof_model.fit_predict(out_table[input_cols])
    out_table[new_column_name] = ['in' if pred == 1 else 'out' for pred in predictions]
    
    if choice == 'add_prediction':
        pass
    elif choice == 'remove_outliers':
        out_table = out_table[out_table[new_column_name] == 'in']
        out_table = out_table.drop(new_column_name, axis=1)
    elif choice == 'both':
        out_table = out_table[out_table[new_column_name] == 'in']
    
    params = {
        'Input Columns': input_cols,
        'Result Type': choice,
        'Number of Neighbors': n_neighbors,
    #    'Algorithm': algorithm,
    #    'Metric': metric,
    #    'Contamination': contamination
    }
    
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Outlier Detection (Local Outlier Factor) Result
    | ### Parameters
    |
    | {display_params}
    """.format(display_params=dict2MD(params))))
    
    model = _model_dict('outlier_detection_lof')
    model['params'] = params
    model['lof_model'] = lof_model
    model['report'] = rb.get()
    
    return {'out_table':out_table, 'model':model}
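
A self-contained sketch of the LocalOutlierFactor call used above, on hypothetical data; fit_predict returns 1 for inliers and -1 for outliers.

import numpy as np
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.RandomState(0)
data = pd.DataFrame(rng.normal(size=(30, 2)), columns=['a', 'b'])
data.loc[len(data)] = [8.0, 8.0]  # plant one obvious outlier

lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
pred = lof.fit_predict(data[['a', 'b']])
print(data[pred == -1])  # the planted outlier is flagged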
Example #4
def _pca(table,
         input_cols,
         new_column_name='projected_',
         n_components=None,
         copy=True,
         whiten=False,
         svd_solver='auto',
         tol=0.0,
         iterated_power='auto',
         random_state=None,
         hue=None,
         alpha=0,
         key_col=None):

    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    pca = PCA(None, copy, whiten, svd_solver, tol, iterated_power,
              random_state)
    pca_model = pca.fit(table[input_cols])

    column_names = [new_column_name + str(i) for i in range(n_components)]

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result[:, :n_components],
                          columns=column_names)

    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components[:n_components],
                                     columns=input_cols)
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if n_components == 1:
        sns.scatterplot(column_names[0], column_names[0], hue=hue, data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(
            0,
            1,
            pc_columns=column_names,
            columns=input_cols,
            singular_values=res_singular_values,
            components=res_components,
            explained_variance_ratio=res_explained_variance_ratio,
            alpha=alpha,
            hue=hue,
            data=out_df,
            ax=plt.gca(),
            key_col=key_col)

    plt.figure()
    fig_scree = _screeplot(res_explained_variance,
                           res_explained_variance_ratio, n_components)

    table_explained_variance = pd.DataFrame(res_explained_variance,
                                            columns=['explained_variance'])
    table_explained_variance['explained_variance_ratio'] = res_explained_variance_ratio
    table_explained_variance['cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum()

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## PCA Result
    | ### Plot
    | {image1}
    |
    | ### Explained Variance
    | {fig_scree}
    | {table_explained_variance}    
    |
    | ### Components
    | {table2}
    |
    | ### Parameters
    | {parameter1}
    """.format(image1=plt_two,
               fig_scree=fig_scree,
               table_explained_variance=pandasDF2MD(table_explained_variance),
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df))))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['report'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    return {'out_table': out_df, 'model': model}
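
The core of the function is a PCA fitted on all components and sliced afterwards. A minimal sketch of that pattern on hypothetical data:

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
table = pd.DataFrame(rng.normal(size=(50, 3)), columns=['f1', 'f2', 'f3'])

pca = PCA()  # fit every component; slice the projection afterwards
projected = pca.fit_transform(table[['f1', 'f2', 'f3']])
n_components = 2
out = pd.DataFrame(projected[:, :n_components],
                   columns=['projected_' + str(i) for i in range(n_components)])
print(pca.explained_variance_ratio_)  # variance share per component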
Example #5
def _pca(table,
         input_cols,
         new_column_name='projected_',
         n_components=None,
         copy=True,
         whiten=False,
         svd_solver='auto',
         tol=0.0,
         iterated_power='auto',
         random_state=None):

    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    pca = PCA(n_components, copy, whiten, svd_solver, tol, iterated_power,
              random_state)
    pca_model = pca.fit(table[input_cols])

    column_names = [new_column_name + str(i) for i in range(n_components)]

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result, columns=column_names)

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components, columns=input_cols)
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if res_n_components == 1:
        plt.scatter(pca_result[:, 0], pca_result[:, 0])
    else:
        plt.scatter(pca_result[:, 0], pca_result[:, 1])
    # plt.title('PCA result with two components')
    # plt.show()
    plt_two = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | 
    | ### Plot
    | The x-axis and y-axis of the following plot are projected_0 and projected_1, respectively.
    | {image1}
    |
    | ### Result
    | {table1}
    | only showing top 20 rows
    |
    | ### Parameters
    | {parameter1}
    |
    | ### Components
    | {table2}
    | 
    | ### Mean
    | {array1}
    | 
    | ### Explained Variance 
    | {array2}
    |
    """.format(table1=pandasDF2MD(out_df, 20),
               image1=plt_two,
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df),
               array1=res_mean,
               array2=res_explained_variance)))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['report'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    return {'out_table': out_df, 'model': model}
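
Unlike the previous variant, this one passes n_components directly to PCA. A common companion step, sketched here on hypothetical data, is choosing n_components from the cumulative explained-variance ratio:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 3))
X = np.hstack([X, X[:, :2] * 2.0])  # add correlated columns so variance concentrates

cumulative = PCA().fit(X).explained_variance_ratio_.cumsum()
n_components = int(np.searchsorted(cumulative, 0.95)) + 1  # smallest count covering 95%
print(cumulative, n_components)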
Example #6
def paired_ttest(table,
                 first_column,
                 second_column,
                 alternative,
                 hypothesized_difference=0,
                 confidence_level=0.95):
    df = len(table) - 1
    abs_diff = (table[first_column] - table[second_column]).abs()
    diff_mean = abs_diff.mean()
    std_dev = np.sqrt(((diff_mean - abs_diff) ** 2).mean())
    ans = stats.ttest_rel(table[first_column],
                          table[second_column] + hypothesized_difference)
    t_value = ans[0]
    p_value_ul = ans[1]
    p_value_u = stats.t.sf(t_value, df)   # one-sided: greater
    p_value_l = stats.t.cdf(t_value, df)  # one-sided: less

    left_u = diff_mean - std_dev * stats.t.isf(
        (1 - confidence_level), df) / np.sqrt(df)
    right_u = np.inf
    left_l = -np.inf
    right_l = diff_mean + std_dev * stats.t.isf(
        (1 - confidence_level), df) / np.sqrt(df)
    left_ul = diff_mean - std_dev * stats.t.isf(
        (1 - confidence_level) / 2, df) / np.sqrt(df)
    right_ul = diff_mean + std_dev * stats.t.isf(
        (1 - confidence_level) / 2, df) / np.sqrt(df)

    statistics_str = ("t statistics, t distribution with " + str(df) +
                      " degrees of freedom under the null hypothesis")
    result_value_u = [{
        'data': first_column + " , " + second_column,
        'alternative_hypothesis': "true difference in means > " + str(hypothesized_difference),
        'statistics': statistics_str,
        'estimates': t_value,
        'p_value': p_value_u,
        'confidence_level': confidence_level,
        'low_confidence_interval': left_u,
        'upper_confidence_interval': right_u
    }]
    result_value_l = [{
        'data': first_column + " , " + second_column,
        'alternative_hypothesis': "true difference in means < " + str(hypothesized_difference),
        'statistics': statistics_str,
        'estimates': t_value,
        'p_value': p_value_l,
        'confidence_level': confidence_level,
        'low_confidence_interval': left_l,
        'upper_confidence_interval': right_l
    }]
    result_value_ul = [{
        'data': first_column + " , " + second_column,
        'alternative_hypothesis': "true difference in means != " + str(hypothesized_difference),
        'statistics': statistics_str,
        'estimates': t_value,
        'p_value': p_value_ul,
        'confidence_level': confidence_level,
        'low_confidence_interval': left_ul,
        'upper_confidence_interval': right_ul
    }]

    result_columns = ['data', 'alternative_hypothesis', 'statistics',
                      'estimates', 'p_value', 'confidence_level',
                      'low_confidence_interval', 'upper_confidence_interval']
    df_u = pd.DataFrame(result_value_u, columns=result_columns)
    df_l = pd.DataFrame(result_value_l, columns=result_columns)
    df_ul = pd.DataFrame(result_value_ul, columns=result_columns)

    # DataFrame.append was removed in pandas 2.0; build the result with pd.concat.
    selected = []
    if 'greater' in alternative:
        selected.append(df_u)
    if 'less' in alternative:
        selected.append(df_l)
    if 'twosided' in alternative:
        selected.append(df_ul)
    df_result = pd.concat(selected, ignore_index=True) if selected else pd.DataFrame(columns=result_columns)

    params = {
        'Input columns': first_column + ", " + second_column,
        'Hypothesized difference': str(hypothesized_difference),
        'Confidence level': str(confidence_level)
    }

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Paired T Test Result
    |
    |df|mean_difference|standard_deviation|t_value
    |--|--|--|--
    |{deg_f}|{dm}|{sd}|{tv}
    |
    | ### Parameters
    | {params}
    """.format(deg_f=df,
               dm=diff_mean,
               sd=std_dev,
               tv=t_value,
               params=dict2MD(params))))

    if 'greater' in alternative:
        rb.addMD(
            strip_margin("""
        | - H0 : true difference in means is less than or equal to {hd}.
        | - H1 : true difference in means is larger than {hd}.
        |
        |p_value|confidence_level|confidence_interval
        |--|--|--
        |{pvu}|{con_lv}|({l_u}, {r_u})
        |
        """.format(pvu=p_value_u,
                   hd=str(hypothesized_difference),
                   con_lv=str(confidence_level),
                   l_u=left_u,
                   r_u=right_u)))

    if 'less' in alternative:
        rb.addMD(
            strip_margin("""
        | - H0 : true difference in means is larger than or equal to {hd}.
        | - H1 : true difference in means is less than {hd}.
        |
        |p_value|confidence_level|confidence_interval
        |--|--|--
        |{pvl}|{con_lv}|({l_l}, {r_l})
        |
        """.format(pvl=p_value_l,
                   hd=str(hypothesized_difference),
                   con_lv=str(confidence_level),
                   l_l=left_l,
                   r_l=right_l)))

    if 'twosided' in alternative:
        rb.addMD(
            strip_margin("""
        | - H0 : true difference in means is equal to {hd}.
        | - H1 : true difference in means is not equal to {hd}.
        |
        |p_value|confidence_level|confidence_interval
        |--|--|--
        |{pvul}|{con_lv}|({l_ul}, {r_ul})
        |
        """.format(pvul=p_value_ul,
                   hd=str(hypothesized_difference),
                   con_lv=str(confidence_level),
                   l_ul=left_ul,
                   r_ul=right_ul)))

    model = dict()
    model['report'] = rb.get()

    return {'out_table': df_result, 'model': model}
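
A minimal sketch of the SciPy calls underlying this test, on hypothetical paired samples; note that the one-sided p-values reuse the same degrees of freedom as the two-sided test.

import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
before = rng.normal(10.0, 2.0, size=30)
after = before + rng.normal(0.5, 1.0, size=30)  # paired measurements

t_value, p_two_sided = stats.ttest_rel(before, after)
deg_f = len(before) - 1
p_greater = stats.t.sf(t_value, deg_f)   # H1: true mean difference > 0
p_less = stats.t.cdf(t_value, deg_f)     # H1: true mean difference < 0
print(t_value, p_two_sided, p_greater, p_less)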
def _decision_tree_classification_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]), 
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        class_weight=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    classifier = DecisionTreeClassifier(
        criterion=criterion, splitter=splitter, max_depth=max_depth,
        min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features, random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split, class_weight=class_weight,
        presort=presort)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight=sample_weight, check_input=check_input,
                   X_idx_sorted=X_idx_sorted)

    try:
        from io import StringIO  # sklearn.externals.six was removed from scikit-learn
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(classifier,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        class_names=table[label_col].astype('str').unique(),
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.report import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = ("Graphviz is needed to draw a Decision Tree graph. "
                    "Please download it from http://graphviz.org/download/ and install it on your computer.")

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report

    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             feature_importance[indices],
             color='b',
             align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    # Add tree plot

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['report'] = rb.get()

    return {'model': model}
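
A self-contained sketch of the classifier training above; export_text is a Graphviz-free way to inspect the fitted tree, and the data here is hypothetical.

import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text

df = pd.DataFrame({'height': [1.0, 1.2, 5.0, 5.5],
                   'width': [0.9, 1.1, 4.8, 5.2],
                   'label': ['small', 'small', 'large', 'large']})
clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)
clf.fit(df[['height', 'width']], df['label'])
print(export_text(clf, feature_names=['height', 'width']))
print(clf.feature_importances_)  # relative importance per feature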
Example #8
def _kmeans_train_predict(table,
                          input_cols,
                          n_clusters=3,
                          prediction_col='prediction',
                          init='k-means++',
                          n_init=10,
                          max_iter=300,
                          tol=1e-4,
                          precompute_distances='auto',
                          seed=None,
                          n_jobs=1,
                          algorithm='auto',
                          n_samples=None):
    inputarr = table[input_cols]
    if n_samples is None:
        n_samples = len(inputarr)

    k_means = SKKMeans(n_clusters=n_clusters,
                       init=init,
                       n_init=n_init,
                       max_iter=max_iter,
                       tol=tol,
                       precompute_distances=precompute_distances,
                       verbose=0,
                       random_state=seed,
                       copy_x=True,
                       n_jobs=n_jobs,
                       algorithm=algorithm)

    k_means.fit(inputarr)

    params = {
        'input_cols': input_cols,
        'n_clusters': n_clusters,
        'init': init,
        'n_init': n_init,
        'max_iter': max_iter,
        'tol': tol,
        'precompute_distances': precompute_distances,
        'seed': seed,
        'n_jobs': n_jobs,
        'algorithm': algorithm
    }

    cluster_centers = k_means.cluster_centers_
    labels = k_means.labels_

    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    fig_centers = _kmeans_centers_plot(input_cols, cluster_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples,
                                       cluster_centers)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2)

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers} 
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=k_means.n_iter_,
               fig_cluster_centers=fig_centers,
               fig_pca=fig_pca,
               fig_samples=fig_samples,
               params=dict2MD(params))))

    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['report'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table': out_table, 'model': model}
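
A minimal sketch of the scikit-learn KMeans call this wrapper configures (SKKMeans above is assumed to be sklearn.cluster.KMeans); the two-blob data is hypothetical.

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
table = pd.DataFrame(np.vstack([rng.normal(0.0, 1.0, (20, 2)),
                                rng.normal(5.0, 1.0, (20, 2))]),
                     columns=['x', 'y'])
k_means = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=42)
table['prediction'] = k_means.fit_predict(table[['x', 'y']])
print(k_means.cluster_centers_)            # one coordinate row per cluster
print(table['prediction'].value_counts())  # cluster sizes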
def naive_bayes_train(table,
                      feature_cols,
                      label_col,
                      alpha=1.0,
                      fit_prior=True,
                      class_prior=None):

    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        # class_prior arrives as a list of "label:prior" strings; map each prior
        # onto the encoded position of its label.
        tmp_class_prior = [0] * len(class_prior)
        for elem in class_prior:
            parts = elem.split(":")
            tmp_class_prior[label_encoder.transform([parts[0]])[0]] = float(parts[1])
        class_prior = tmp_class_prior

    nb_model = MultinomialNB(alpha, fit_prior, class_prior)
    nb_model.fit(features, label_correspond)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    # get_param['Prior Probabilities of the Classes'] = class_prior
    # get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=label_encoder.classes_,
                          title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Naive Bayes Classification Result
    | ### Parameters
    | {table_parameter} 
    | ### Predicted vs Actual
    | {image1}
    | #### Accuracy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix,
               accuracy=accuracy,
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['report'] = rb.get()

    return {'model': model}
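
A self-contained sketch of the MultinomialNB-plus-LabelEncoder pattern used above; MultinomialNB expects non-negative features such as counts, and the toy columns here are hypothetical.

import pandas as pd
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB

df = pd.DataFrame({'w1': [3, 0, 0, 4], 'w2': [0, 2, 3, 0],
                   'topic': ['sport', 'tech', 'tech', 'sport']})
enc = preprocessing.LabelEncoder().fit(df['topic'])
nb = MultinomialNB(alpha=1.0)
nb.fit(df[['w1', 'w2']], enc.transform(df['topic']))
pred = nb.predict([[2, 0]])           # encoded prediction
print(enc.inverse_transform(pred))    # back to the original label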
Example #10
def _outlier_detection_tukey_carling(table, input_cols, outlier_method="tukey", multiplier=None, number_of_removal=1,
                                     choice='add_prediction', new_column_prefix='is_outlier_'):
    out_table = table.copy()

    if multiplier is None and outlier_method == "tukey":
        multiplier = 1.5
    elif multiplier is None and outlier_method == "carling":
        multiplier = 2.3
    
    mean = table[input_cols].mean()
    q1s = table[input_cols].quantile(0.25)
    q3s = table[input_cols].quantile(0.75)
    iqrs = q3s - q1s
    
    new_column_names = ['{prefix}{col}'.format(prefix=new_column_prefix, col=col) for col in input_cols]

    def _tukey(x, q1, q3, iqr, multiplier):
        return 'out' if x < q1 - multiplier * iqr or x > q3 + multiplier * iqr else 'in' 

    def _carling(x, mean, iqr, multiplier):
        return 'out' if x < mean - multiplier * iqr or x > mean + multiplier * iqr else 'in'
    
    if outlier_method == "tukey":
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix, col=col)
            out_table[output_col_name] = table[col].apply(lambda _: _tukey(_, q1s[col], q3s[col], iqrs[col], multiplier))
            
    elif outlier_method == "carling":
        if multiplier is None:
            multiplier = 2.3
            
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix, col=col)
            out_table[output_col_name] = table[col].apply(lambda _: _carling(_, mean[col], iqrs[col], multiplier))
        
    prediction = out_table[new_column_names].apply(lambda row: np.sum(row == 'out') < number_of_removal, axis=1)
    
    rb = ReportBuilder()
    params = { 
        'Input Columns': input_cols,
        'Outlier Method': outlier_method,
        'Multiplier': multiplier,
        'Number of Outliers in a Row': number_of_removal,
        'Result Type': choice,
        'New Column Prefix': new_column_prefix
    }
    rb.addMD(strip_margin("""
    | ## Outlier Detection (Tukey/Carling) Result
    | ### Parameters
    |
    | {display_params}
    """.format(display_params=dict2MD(params))))
    
    if choice == 'add_prediction':
        pass
    elif choice == 'remove_outliers':
        out_table = out_table.drop(new_column_names, axis=1)
        out_table = out_table[prediction.values]
    elif choice == 'both':
        out_table = out_table[prediction.values]
    
    model = _model_dict('outlier_detection_tukey_carling')
    model['params'] = params
    model['mean'] = mean
    model['q1'] = q1s
    model['q3'] = q3s
    model['iqr'] = iqrs
    model['multiplier'] = multiplier
    model['report'] = rb.get()
    
    return {'out_table': out_table, 'model' : model}
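
The fences themselves are plain quantile arithmetic. A minimal sketch of the Tukey rule on a hypothetical series:

import pandas as pd

s = pd.Series([1.0, 1.2, 0.9, 1.1, 1.0, 15.0])  # 15.0 is an obvious outlier
q1, q3 = s.quantile(0.25), s.quantile(0.75)
iqr = q3 - q1
multiplier = 1.5  # Tukey's default fence; Carling centers on the mean with 2.3 instead
is_outlier = (s < q1 - multiplier * iqr) | (s > q3 + multiplier * iqr)
print(s[is_outlier])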
def _xgb_classification_train(table,
                              feature_cols,
                              label_col,
                              max_depth=3,
                              learning_rate=0.1,
                              n_estimators=100,
                              silent=True,
                              objective='binary:logistic',
                              booster='gbtree',
                              n_jobs=1,
                              nthread=None,
                              gamma=0,
                              min_child_weight=1,
                              max_delta_step=0,
                              subsample=1,
                              colsample_bytree=1,
                              colsample_bylevel=1,
                              reg_alpha=0,
                              reg_lambda=1,
                              scale_pos_weight=1,
                              base_score=0.5,
                              random_state=0,
                              seed=None,
                              missing=None,
                              sample_weight=None,
                              eval_set=None,
                              eval_metric=None,
                              early_stopping_rounds=None,
                              verbose=True,
                              xgb_model=None,
                              sample_weight_eval_set=None):
    classifier = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate,
                               n_estimators=n_estimators, silent=silent,
                               objective=objective, booster=booster,
                               n_jobs=n_jobs, nthread=nthread, gamma=gamma,
                               min_child_weight=min_child_weight,
                               max_delta_step=max_delta_step, subsample=subsample,
                               colsample_bytree=colsample_bytree,
                               colsample_bylevel=colsample_bylevel,
                               reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                               scale_pos_weight=scale_pos_weight,
                               base_score=base_score, random_state=random_state,
                               seed=seed, missing=missing)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight=sample_weight, eval_set=eval_set,
                   eval_metric=eval_metric,
                   early_stopping_rounds=early_stopping_rounds,
                   verbose=verbose, xgb_model=xgb_model,
                   sample_weight_eval_set=sample_weight_eval_set)

    # json
    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_
    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    # plot_tree(classifier) renderings (upright and rankdir='LR') were prototyped
    # here but are left out of the report.

    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier

    # report
    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Importance
    | {fig_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))
    model['report'] = rb.get()

    return {'model': model}
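
A minimal sketch of the xgboost scikit-learn API used above, on hypothetical binary-labelled data:

import numpy as np
import pandas as pd
from xgboost import XGBClassifier

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=['f1', 'f2', 'f3'])
df['label'] = (df['f1'] + 0.5 * df['f2'] > 0).astype(int)

clf = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=50,
                    objective='binary:logistic')
clf.fit(df[['f1', 'f2', 'f3']], df['label'])
print(clf.feature_importances_)  # per-feature importance, as plotted above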
Example #12
def _hierarchical_clustering(table, input_cols, link='complete', met='euclidean', p=2, num_rows=20, figure_height=6.4, orient='right'):
    table = table.copy()
    df = table[input_cols]
    Z = linkage(df, method=link, metric=met)
    out_table = pd.DataFrame([])
    out_table['linkage_step'] = [x + 1 for x in reversed(range(len(Z)))]
    out_table['joined_column1'] = ['pt_' + str(int(Z[:, 0][i])) for i in range(len(Z))]
    out_table['joined_column2'] = ['pt_' + str(int(Z[:, 1][i])) for i in range(len(Z))]
    out_table['name_of_clusters'] = ['CL_' + str(i + 1) for i in reversed(range(len(Z)))]
    out_table['distance'] = [distance for distance in Z[:, 2]]
    out_table['number_of_original'] = [int(entities) for entities in Z[:, 3]]
    
    # Replace raw point ids with cluster names wherever a previously merged
    # cluster (id >= number of original points) appears in the linkage matrix.
    for i in range(len(Z)):
        if Z[i, 0] >= len(df):
            out_table.loc[i, 'joined_column1'] = out_table['name_of_clusters'][int(Z[i, 0]) - len(df)]
        if Z[i, 1] >= len(df):
            out_table.loc[i, 'joined_column2'] = out_table['name_of_clusters'][int(Z[i, 1]) - len(df)]
    out_table = out_table.reindex(index=out_table.index[::-1])
    out_table1 = out_table.head(num_rows)
    
    # calculate full dendrogram
    def _llf(id):
        # Label leaves (original points) as 'pt_<id>'; merged clusters get no label.
        n = len(df)
        if id < n:
            return 'pt_' + str(id)
 
    plt.figure(figsize=(8.4, figure_height))
    _fancy_dendrogram(
        Z,
        truncate_mode='none',  # 'none' draws the full dendrogram
        get_leaves=True,
        orientation=orient,
        labels=True,
        leaf_label_func=_llf,
        leaf_rotation=45,
        leaf_font_size=5.,
        show_contracted=False,  # only relevant when the dendrogram is truncated
        annotate_above=10.0,  # keeps annotations from overlapping in small plots
        # max_d=distance_threshold,  # would draw a horizontal cut-off line at max_d
    )
    plt.title('Hierarchical Clustering Dendrogram')
    if orient == 'top':
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient == 'right':
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    
    plt2 = plt2MD(plt)
    plt.clf()
    
    rb = ReportBuilder()
    params = { 
        'Input Columns': input_cols,
        'Linkage Method': link,
        'Metric': met,
        'Number of Rows in Linkage Matrix': num_rows
    }
    rb.addMD(strip_margin("""### Hierarchical Clustering Result"""))
    rb.addMD(strip_margin("""
    |## Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    | {display_params}
    |
    |## Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=plt2, display_params=dict2MD(params), out_table1=pandasDF2MD(out_table1))))

    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_cols'] = input_cols
    model['parameters'] = params
    model['outtable'] = out_table
    model['report'] = rb.get()
        
    return { 'model':model}
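
A self-contained sketch of the SciPy linkage matrix this function tabulates; each row of Z records the two ids joined, their distance, and the resulting cluster size. The points are hypothetical.

import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

X = np.array([[0.0], [0.3], [4.0], [4.2], [9.0]])
Z = linkage(X, method='complete', metric='euclidean')
print(Z)  # columns: joined id 1, joined id 2, distance, number of originals

dendrogram(Z, orientation='right',
           leaf_label_func=lambda i: 'pt_' + str(i))  # same naming as above
plt.show()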
def _naive_bayes_train(table,
                       feature_cols,
                       label_col,
                       alpha=1.0,
                       fit_prior=True,
                       class_prior=None):

    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        # class_prior arrives as a list of "label:prior" strings; map each prior
        # onto the encoded position of its label.
        tmp_class_prior = [0] * len(class_prior)
        for elem in class_prior:
            parts = elem.split(":")
            tmp_class_prior[label_encoder.transform([parts[0]])[0]] = float(parts[1])
        class_prior = tmp_class_prior

    nb_model = MultinomialNB(alpha, fit_prior, class_prior)
    nb_model.fit(features, label_correspond)
    class_log_prior = nb_model.class_log_prior_
    feature_log_prob_ = nb_model.feature_log_prob_
    # One row per class: [label, log prior (pi), per-feature log likelihoods (theta_*)].
    tmp_result = np.column_stack(
        (label_encoder.classes_, class_log_prior, feature_log_prob_))
    column_names = (['labels', 'pi'] +
                    ['theta_' + feature_col for feature_col in feature_cols])
    result_table = pd.DataFrame.from_records(tmp_result, columns=column_names)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    # get_param['Prior Probabilities of the Classes'] = class_prior
    get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    _plot_confusion_matrix(cnf_matrix,
                           classes=label_encoder.classes_,
                           title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Naive Bayes Classification Result
    |
    | ### Model: Multinomial
    | {result_table}
    | ### Parameters
    | {table_parameter} 
    | ### Predicted vs Actual
    | {image1}
    | #### Accuracy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix,
               accuracy=accuracy,
               result_table=pandasDF2MD(result_table),
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['report'] = rb.get()

    return {'model': model}
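
This variant additionally tabulates the fitted log priors (pi) and per-feature log likelihoods (theta). A compact equivalent of the table construction above, on hypothetical count data:

import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

X = pd.DataFrame({'w1': [3, 0, 1, 4], 'w2': [0, 2, 3, 0]})
y = np.array([0, 1, 1, 0])
nb = MultinomialNB(alpha=1.0).fit(X, y)

result_table = pd.DataFrame(
    np.column_stack((nb.classes_, nb.class_log_prior_, nb.feature_log_prob_)),
    columns=['labels', 'pi'] + ['theta_' + col for col in X.columns])
print(result_table)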