Beispiel #1
0
def _xgb_regression_train(table,
                          feature_cols,
                          label_col,
                          max_depth=3,
                          learning_rate=0.1,
                          n_estimators=100,
                          silent=True,
                          objectibe='reg:linear',
                          booster='gbtree',
                          n_jobs=1,
                          nthread=None,
                          gamma=0,
                          min_child_weight=1,
                          max_delta_step=0,
                          subsample=1,
                          colsample_bytree=1,
                          colsample_bylevel=1,
                          reg_alpha=0,
                          reg_lambda=1,
                          scale_pos_weight=1,
                          base_score=0.5,
                          random_state=None,
                          seed=None,
                          missing=None,
                          sample_weight=None,
                          eval_set=None,
                          eval_metric=None,
                          early_stopping_rounds=None,
                          verbose=True,
                          xgb_model=None,
                          sample_weight_eval_set=None,
                          importance_type='gain'):
    """Fit an ``XGBRegressor`` on ``table`` and package it with a report.

    Args:
        table: Input data; indexed by column name (``table[label_col]``).
        feature_cols: Feature column names, resolved through
            ``check_col_type`` into the names and values used for fitting.
        label_col: Name of the target column.
        objectibe: Forwarded to the estimator as ``objective``. The
            misspelling is part of the public signature and is kept so that
            existing keyword callers keep working.
        random_state: Seed for the estimator; a random integer is drawn
            when it is None.
        sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose,
        xgb_model, sample_weight_eval_set: Forwarded to ``fit``.
        Remaining keyword arguments are forwarded unchanged to the
        ``XGBRegressor`` constructor.

    Returns:
        ``{'model': out_model}`` where ``out_model`` holds the fitted
        regressor, its parameters, the feature importances (array and
        table), the importance plot and the markdown report.
    """
    if random_state is None:
        random_state = randint(-2**31, 2**31 - 1)

    regressor = XGBRegressor(max_depth=max_depth,
                             learning_rate=learning_rate,
                             n_estimators=n_estimators,
                             silent=silent,
                             objective=objectibe,
                             booster=booster,
                             n_jobs=n_jobs,
                             nthread=nthread,
                             gamma=gamma,
                             min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree,
                             colsample_bylevel=colsample_bylevel,
                             reg_alpha=reg_alpha,
                             reg_lambda=reg_lambda,
                             scale_pos_weight=scale_pos_weight,
                             base_score=base_score,
                             random_state=random_state,
                             seed=seed,
                             missing=missing,
                             importance_type=importance_type)
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]
    # Pass the optional arguments by keyword: the positional order of
    # ``fit`` is not stable across xgboost releases, and a positional call
    # would silently bind values to the wrong parameters.
    regressor.fit(features,
                  label,
                  sample_weight=sample_weight,
                  eval_set=eval_set,
                  eval_metric=eval_metric,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=verbose,
                  xgb_model=xgb_model,
                  sample_weight_eval_set=sample_weight_eval_set)

    # json
    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_
    plot_importance(regressor)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance

    # report
    get_param_list = [['feature_cols', feature_names],
                      ['label_col', label_col]]
    get_param_list.extend([key, value] for key, value in get_param.items())
    get_param_df = pd.DataFrame(data=get_param_list,
                                columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_names).T

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Regression Result
    |
    | ### Plot Feature Importance
    | {image_importance}
    |
    | ### Normalized Feature Importance Table
    | {table_feature_importance}
    |
    | ### Parameters
    | {table_parameter}
    |
    """.format(image_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               table_parameter=pandasDF2MD(get_param_df))))
    out_model['_repr_brtc_'] = rb.get()
    # The importances are aligned with the columns the model was fitted on
    # (``feature_names``), which is also how the report table labels them.
    feature_importance_table = pd.DataFrame(
        [[name, importance]
         for name, importance in zip(feature_names, feature_importance)],
        columns=['feature_name', 'importance'])
    out_model['feature_importance_table'] = feature_importance_table
    return {'model': out_model}
def _hierarchical_clustering(table,
                             input_cols,
                             input_mode,
                             key_col=None,
                             link='complete',
                             met='euclidean',
                             num_rows=20,
                             figure_height=6.4,
                             orient='right'):
    """Run agglomerative clustering and build a dendrogram report.

    Args:
        table: Input DataFrame.
        input_cols: Columns fed to ``linkage``.
        input_mode: ``'original'`` clusters the rows of ``input_cols``;
            ``'matrix'`` clusters the rows located at the column positions
            of ``input_cols``. Any other value is reported through
            ``raise_runtime_error``.
        key_col: Optional column whose values label the leaves. Without it,
            leaves are named ``pt_<i>`` (original mode) or after the input
            columns (matrix mode).
        link: Linkage method passed to ``linkage``.
        met: Distance metric passed to ``linkage``.
        num_rows: Number of linkage-matrix rows shown in the report.
        figure_height: Height of the dendrogram figure.
        orient: Dendrogram orientation; ``'top'`` and ``'right'`` also set
            the axis labels.

    Returns:
        ``{'model': model}`` with the linkage result, the parameters, the
        readable linkage matrix and the markdown report (plus the labelled
        distance matrix in matrix mode).
    """
    out_table = table.copy()
    features = out_table[input_cols]

    if input_mode == 'original':
        len_features = len(features)
        if key_col is not None:
            data_names = list(out_table[key_col])
        else:
            data_names = ['pt_' + str(i) for i in range(len_features)]
        Z = linkage(features, method=link, metric=met)
    elif input_mode == 'matrix':
        len_features = len(input_cols)
        col_index = [
            out_table.columns.get_loc(column) for column in input_cols
        ]
        if key_col is not None:
            # Positional lookup, matching the positional row selection used
            # for ``dist_matrix`` below.
            key_values = out_table[key_col]
            data_names = [key_values.iloc[position] for position in col_index]
        else:
            data_names = [
                out_table.columns[position] for position in col_index
            ]
        # Copy so that adding the 'label' column never writes into a slice
        # of the input frame.
        dist_matrix = features.iloc[col_index].copy()

        Z = linkage(dist_matrix, method=link, metric=met)
        dist_matrix['label'] = data_names
    else:
        raise_runtime_error("Please check 'input_mode'.")

    n_merges = len(Z)
    # Row k describes merge k; clusters are numbered so that the final
    # merge is CL_1 and the first merge is CL_<n_merges>.
    cluster_names = ['CL_' + str(n_merges - step) for step in range(n_merges)]

    def _member_name(node_id):
        # Ids below len_features are original observations; larger ids
        # refer to the cluster created by merge (id - len_features).
        node_id = int(node_id)
        if node_id < len_features:
            return data_names[node_id]
        return cluster_names[node_id - len_features]

    linkage_matrix = pd.DataFrame(
        {
            'linkage_step': list(range(n_merges, 0, -1)),
            'name_of_clusters': cluster_names,
            'joined_column1': [_member_name(node) for node in Z[:, 0]],
            'joined_column2': [_member_name(node) for node in Z[:, 1]],
            'distance': list(Z[:, 2]),
            'number_of_original': [int(entities) for entities in Z[:, 3]],
        },
        columns=[
            'linkage_step', 'name_of_clusters', 'joined_column1',
            'joined_column2', 'distance', 'number_of_original'
        ])
    # Present the matrix starting from the last merge.
    linkage_matrix = linkage_matrix.reindex(index=linkage_matrix.index[::-1])

    # calculate full dendrogram
    plt.figure(figsize=(8.4, figure_height))
    _fancy_dendrogram(
        Z,
        truncate_mode='none',  # show only the last p merged clusters (if another)
        get_leaves=True,
        orientation=orient,
        labels=data_names,
        leaf_rotation=45,
        leaf_font_size=5.,
        show_contracted=False,  # to get a distribution impression in truncated branches
        annotate_above=10.0,  # useful in small plots so annotations don't overlap
    )
    plt.title('Hierarchical Clustering Dendrogram')
    if orient == 'top':
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient == 'right':
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    plt2 = plt2MD(plt)
    plt.clf()

    params = {
        'Input Columns': input_cols,
        'Linkage Method': link,
        'Metric': met,
        'Number of Rows in Linkage Matrix': num_rows
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""### Hierarchical Clustering Result"""))
    rb.addMD(
        strip_margin("""
    |## Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    | {display_params}
    |
    |## Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=plt2,
               display_params=dict2MD(params),
               out_table1=pandasDF2MD(linkage_matrix.head(num_rows)))))

    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_mode'] = input_mode
    if input_mode == 'matrix':
        model['dist_matrix'] = dist_matrix
    model['parameters'] = params
    model['linkage_matrix'] = linkage_matrix
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Beispiel #3
0
def _als_train(table,
               user_col,
               item_col,
               rating_col,
               mode='train',
               number=10,
               implicit=False,
               iterations=10,
               reg_param=0.1,
               rank=10,
               alpha=1.0,
               seed=None,
               targets=None):
    """Train an ALS recommender and return its factors or top-N items.

    Args:
        table: Input DataFrame with one row per (user, item, rating).
        user_col: Name of the user column.
        item_col: Name of the item column.
        rating_col: Name of the rating column.
        mode: ``'Topn'`` returns recommendations; any other value returns
            the trained model.
        number: Number of recommendations per user in ``'Topn'`` mode.
        implicit: Forwarded to the ALS model. When False, ratings equal to
            0 are stored as -1 in the item-user matrix.
        iterations, reg_param, rank, alpha, seed: Forwarded to
            ``AlternatingLeastSquares``.
        targets: Users to recommend for in ``'Topn'`` mode; defaults to
            every user seen during training.

    Returns:
        ``{'out_table': DataFrame}`` in ``'Topn'`` mode, otherwise
        ``{'model': model}`` with the fitted model, both label encoders,
        the factor tables and the markdown report.
    """
    table_user_col = table[user_col]
    table_item_col = table[item_col]
    # Read the ratings positionally: the encoded user/item arrays are
    # positional, so a label-based lookup would break or mis-align on any
    # table whose index is not 0..n-1.
    ratings = table[rating_col].values
    user_encoder = preprocessing.LabelEncoder()
    item_encoder = preprocessing.LabelEncoder()
    user_encoder.fit(table_user_col)
    item_encoder.fit(table_item_col)
    user_correspond = user_encoder.transform(table_user_col)
    item_correspond = item_encoder.transform(table_item_col)

    item_users = np.zeros(
        (len(item_encoder.classes_), len(user_encoder.classes_)))
    if implicit:
        cell_values = ratings
    else:
        # Explicit zero ratings are stored as -1 (presumably so they are not
        # dropped as empty cells by the sparse conversion below).
        cell_values = np.where(ratings == 0, -1, ratings)
    # For a repeated (item, user) pair the last row wins, exactly as in a
    # sequential row-by-row fill.
    item_users[item_correspond, user_correspond] = cell_values

    item_users = csr_matrix(item_users)
    als_model = AlternatingLeastSquares(factors=rank,
                                        implicit=implicit,
                                        iterations=iterations,
                                        regularization=reg_param,
                                        alpha=alpha,
                                        seed=seed)
    als_model.fit(item_users)

    user_factors = pd.DataFrame(user_encoder.classes_, columns=[user_col])
    user_factors['features'] = [list(row) for row in als_model.user_factors]
    item_factors = pd.DataFrame(item_encoder.classes_, columns=[item_col])
    item_factors['features'] = [list(row) for row in als_model.item_factors]

    if mode == 'Topn':
        if targets is None:
            targets = user_encoder.classes_
        targets_en = user_encoder.transform(targets)
        user_items = item_users.T.tocsr()
        Topn_result = []
        for user in targets_en:
            recommendations = []
            for item, rating in als_model.recommend(user, user_items, number):
                recommendations.extend(
                    [item_encoder.inverse_transform([item])[0], rating])
            Topn_result.append(recommendations)
        Topn_result = pd.DataFrame(Topn_result)
        Topn_result = pd.concat([pd.DataFrame(targets), Topn_result],
                                axis=1,
                                ignore_index=True)
        column_names = ['user']
        for i in range(number):
            column_names += ['item_top%d' % (i + 1), 'rating_top%d' % (i + 1)]
        Topn_result.columns = column_names
        return {'out_table': Topn_result}

    parameters = dict()
    parameters['Iterations'] = iterations
    parameters['Reg Param'] = reg_param
    parameters['Seed'] = seed
    parameters['Rank'] = rank
    if implicit:
        parameters['alpha'] = alpha
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## ALS Train Result
    |
    | ### Parameters
    | {parameters} 
    | ### Item Factors
    | {item_factors}
    | ### User Factors
    | {user_factors}
    |
    """.format(item_factors=pandasDF2MD(item_factors,
                                        num_rows=item_users.shape[0]),
               user_factors=pandasDF2MD(user_factors,
                                        num_rows=item_users.shape[1]),
               parameters=dict2MD(parameters))))

    model = _model_dict('ALS')
    model['als_model'] = als_model
    model['item_encoder'] = item_encoder
    model['user_encoder'] = user_encoder
    model['user_col'] = user_col
    model['item_col'] = item_col
    model['user_factors'] = user_factors
    model['item_factors'] = item_factors
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
Beispiel #4
0
def _lda(table,
         input_col,
         num_voca=1000,
         num_topic=3,
         num_topic_word=3,
         max_iter=20,
         learning_method='online',
         learning_offset=10.,
         random_state=None):
    """Fit an LDA topic model on a text column and build its report.

    Args:
        table: Input DataFrame.
        input_col: Name of the column holding the documents.
        num_voca: ``max_features`` of the count vectorizer.
        num_topic: Number of topics (``n_components``).
        num_topic_word: Number of top-weighted terms listed per topic.
        max_iter: Maximum number of LDA iterations.
        learning_method: ``'online'`` or ``'batch'``; any other value is
            reported through ``raise_runtime_error``.
        learning_offset: Passed to the estimator in ``'online'`` mode only.
        random_state: Seed passed to the estimator.

    Returns:
        ``{'model': model}`` holding the parameters, the topic table, the
        per-document top topic and the markdown report.
    """
    corpus = table[input_col]
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=num_voca,
                                    stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # learning_offset is only handed to the estimator for online learning.
    if learning_method == 'online':
        estimator = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state)
    elif learning_method == 'batch':
        estimator = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state)
    else:
        raise_runtime_error("Please check 'learning_method'.")
    lda_model = estimator.fit(term_count)

    topic_labels = []
    topic_terms = []
    for topic_no, component in enumerate(lda_model.components_):
        topic_labels.append("Topic {}".format(topic_no))
        # Rank the vocabulary by absolute weight, heaviest first.
        ranked = sorted(
            ((abs(weight), tf_feature_names[position])
             for position, weight in enumerate(component)),
            key=lambda entry: entry[0],
            reverse=True)
        topic_terms.append([
            "{}: {}".format(term, weight)
            for weight, term in ranked[:num_topic_word]
        ])
    topic_model = pd.DataFrame([])
    topic_model['topic idx'] = topic_labels
    topic_model['topic vocabularies'] = topic_terms

    doc_topic = lda_model.transform(term_count)
    n_docs = len(corpus)

    doc_classification = pd.DataFrame()
    doc_classification['documents'] = list(corpus)
    doc_classification['top topic'] = [
        "Topic {}".format(doc_topic[row].argmax()) for row in range(n_docs)
    ]

    params = {
        'Input Column': input_col,
        'Number of Vocabularies': num_voca,
        'Number of Topics': num_topic,
        'Number of Terminologies': num_topic_word,
        'Iterations': max_iter,
        'Learning Method': learning_method,
    }

    report_md = strip_margin("""
    |
    |### Parameters
    |
    | {display_params}
    |
    |### Topic Model
    |
    |{topic_model}
    |
    |### Documents Classification
    |
    |{doc_classification}
    |
    """.format(display_params=dict2MD(params),
               topic_model=pandasDF2MD(topic_model, num_rows=num_topic + 1),
               doc_classification=pandasDF2MD(doc_classification,
                                              num_rows=n_docs + 1)))

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Latent Dirichlet Allocation Result"""))
    rb.addMD(report_md)

    model = _model_dict('lda')
    model['parameter'] = params
    model['topic_model'] = topic_model
    model['documents_classification'] = doc_classification
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Beispiel #5
0
def _evaluate_classification(table, label_col, prediction_col):
    """Score predictions against labels and build a report.

    Args:
        table: Input DataFrame.
        label_col: Name of the column with the true labels.
        prediction_col: Name of the column with the predicted labels.

    Returns:
        ``{'result': summary}`` where ``summary`` holds the column names,
        accuracy and weighted F1 / precision / recall, a one-row metrics
        DataFrame and the markdown report with both confusion matrices.
    """
    label = table[label_col]
    predict = table[prediction_col]

    # compute metrics
    accuracy = accuracy_score(label, predict)
    f1 = f1_score(label, predict, average="weighted")
    precision = precision_score(label, predict, average="weighted")
    recall = recall_score(label, predict, average="weighted")
    # union1d already returns the sorted, de-duplicated set of classes.
    class_names = np.union1d(label.values, predict.values)

    # Plot non-normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label,
                           predict,
                           classes=class_names,
                           title='Confusion matrix, without normalization')
    fig_cnf_matrix = plt2MD(plt)
    # Clear this figure as well; previously only the second one was cleared.
    plt.clf()

    # Plot normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label,
                           predict,
                           classes=class_names,
                           normalize=True,
                           title='Normalized confusion matrix')
    fig_cnf_matrix_normalized = plt2MD(plt)
    plt.clf()

    # json
    summary = dict()
    summary['label_col'] = label_col
    summary['prediction_col'] = prediction_col
    summary['f1_score'] = f1
    summary['accuracy_score'] = accuracy
    summary['precision_score'] = precision
    summary['recall_score'] = recall

    # report
    all_df = pd.DataFrame([[f1, accuracy, precision, recall]],
                          columns=['f1', 'accuracy', 'precision', 'recall'])
    summary['metrics'] = all_df

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Evaluate Classification Result
    | ### Metrics
    | {table1}
    |
    | ### Confusion matrix
    | {fig_confusion_matrix}
    |
    | {fig_confusion_matrix_normalized}
    |
    """.format(table1=pandasDF2MD(all_df),
               fig_confusion_matrix=fig_cnf_matrix,
               fig_confusion_matrix_normalized=fig_cnf_matrix_normalized)))
    summary['_repr_brtc_'] = rb.get()

    return {'result': summary}
def _xgb_classification_train(table,
                              feature_cols,
                              label_col,
                              max_depth=3,
                              learning_rate=0.1,
                              n_estimators=100,
                              silent=True,
                              objective='binary:logistic',
                              booster='gbtree',
                              n_jobs=1,
                              nthread=None,
                              gamma=0,
                              min_child_weight=1,
                              max_delta_step=0,
                              subsample=1,
                              colsample_bytree=1,
                              colsample_bylevel=1,
                              reg_alpha=0,
                              reg_lambda=1,
                              scale_pos_weight=1,
                              base_score=0.5,
                              random_state=0,
                              seed=None,
                              missing=None,
                              importance_type='gain',
                              class_weight=None,
                              eval_set=None,
                              eval_metric=None,
                              early_stopping_rounds=None,
                              verbose=True,
                              xgb_model=None,
                              sample_weight_eval_set=None):
    """Fit an ``XGBClassifier`` on ``table`` and package it with a report.

    Args:
        table: Input DataFrame.
        feature_cols: Names of the feature columns.
        label_col: Name of the label column.
        class_weight: Optional sequence with one weight per class, in the
            sorted order of the distinct labels; it is turned into
            per-sample weights through ``_make_sample_weight``.
        eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model,
        sample_weight_eval_set: Forwarded to ``fit``.
        Remaining keyword arguments are forwarded unchanged to the
        ``XGBClassifier`` constructor.

    Returns:
        ``{'model': model}`` holding the fitted classifier, its parameters,
        the feature importances (array and table) and the markdown report.

    Raises:
        ValueError: If ``class_weight`` does not have one entry per class.
    """
    y_train = table[label_col]
    class_labels = sorted(set(y_train))
    if class_weight is None:
        sample_weight = None
    else:
        if len(class_weight) != len(class_labels):
            raise ValueError(
                "Number of class weights should match number of labels.")
        class_weight = {
            class_label: class_weight[i]
            for i, class_label in enumerate(class_labels)
        }
        sample_weight = np.vectorize(_make_sample_weight)(y_train,
                                                          class_weight)

    classifier = XGBClassifier(max_depth=max_depth,
                               learning_rate=learning_rate,
                               n_estimators=n_estimators,
                               silent=silent,
                               objective=objective,
                               booster=booster,
                               n_jobs=n_jobs,
                               nthread=nthread,
                               gamma=gamma,
                               min_child_weight=min_child_weight,
                               max_delta_step=max_delta_step,
                               subsample=subsample,
                               colsample_bytree=colsample_bytree,
                               colsample_bylevel=colsample_bylevel,
                               reg_alpha=reg_alpha,
                               reg_lambda=reg_lambda,
                               scale_pos_weight=scale_pos_weight,
                               base_score=base_score,
                               random_state=random_state,
                               seed=seed,
                               missing=missing,
                               importance_type=importance_type)

    # Pass the optional arguments by keyword: the positional order of
    # ``fit`` is not stable across xgboost releases, and a positional call
    # would silently bind values to the wrong parameters.
    classifier.fit(table[feature_cols],
                   table[label_col],
                   sample_weight=sample_weight,
                   eval_set=eval_set,
                   eval_metric=eval_metric,
                   early_stopping_rounds=early_stopping_rounds,
                   verbose=verbose,
                   xgb_model=xgb_model,
                   sample_weight_eval_set=sample_weight_eval_set)

    # json
    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_
    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()

    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier

    # report
    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Feature Importance
    | {fig_importance}
    |
    | ### Normalized Feature Importance Table
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()
    feature_importance_table = pd.DataFrame(
        [[feature_cols[i], feature_importance[i]]
         for i in range(len(feature_cols))],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model': model}
Beispiel #7
0
def _autocorrelation(table, input_col, nlags=20, conf_level=0.95):
    """Compute ACF / PACF plots and tables for one column.

    Args:
        table: Input DataFrame.
        input_col: Name of the series column.
        nlags: Number of lags; the tables have ``nlags + 1`` rows.
        conf_level: Confidence level of the intervals (``alpha`` is
            ``1 - conf_level``). With None, no intervals are computed,
            plotted or tabulated.

    Returns:
        ``{'model': model}`` holding the autocorrelation table, the partial
        autocorrelation table and the markdown report.
    """
    data = table[input_col]
    # Derive alpha once so that conf_level=None is handled consistently;
    # computing ``1 - conf_level`` unconditionally would raise on None.
    alpha = None if conf_level is None else 1 - conf_level

    plt.figure()
    plot_acf(data, lags=nlags, alpha=alpha)
    fig_plt_acf = plt2MD(plt)
    plt.clf()

    plt.figure()
    plot_pacf(data, lags=nlags, alpha=alpha)
    fig_plt_pacf = plt2MD(plt)
    plt.clf()

    def _split(result):
        # Without alpha the estimator returns only the coefficient array;
        # with alpha the coefficients come first and the intervals second.
        if alpha is None:
            return result, None
        return result[0], result[1]

    def _lag_table(value_col, values, confint):
        lag_table = pd.DataFrame([])
        lag_table['lag'] = list(range(nlags + 1))
        lag_table[value_col] = values
        if confint is not None:
            lag_table['%g%% confidence Interval' % (conf_level * 100)] = [
                str((confint[i][0], confint[i][1])) for i in range(nlags + 1)
            ]
        return lag_table

    acf_values, acf_confint = _split(acf(data, nlags=nlags, alpha=alpha))
    pacf_values, pacf_confint = _split(pacf(data, nlags=nlags, alpha=alpha))

    result_table1 = _lag_table('ACF', acf_values, acf_confint)
    result_table2 = _lag_table('PACF', pacf_values, pacf_confint)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""# Autocorrelation / Partial Autocorrelation Result"""))
    rb.addMD(
        strip_margin("""
    |## Autocorrelation
    |
    |{image1}
    |
    |### Autocorrelation Table
    |
    |{result_table1}
    |
    |## Partial Autocorrelation
    |
    |{image2}
    |
    |### Partial Autocorrelation Table
    |
    |{result_table2}
    |
    """.format(image1=fig_plt_acf,
               result_table1=pandasDF2MD(result_table1, num_rows=nlags + 1),
               image2=fig_plt_pacf,
               result_table2=pandasDF2MD(result_table2, num_rows=nlags + 1))))

    model = _model_dict('autocorrelation')
    model['autocorrelation_table'] = result_table1
    model['partial_autocorrelation_table'] = result_table2
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Beispiel #8
0
def _gaussian_mixture_train(table,
                            input_cols,
                            number_of_components=1,
                            covariance_type='full',
                            tolerance=0.001,
                            regularize_covariance=1e-06,
                            max_iteration=100,
                            initial_params='kmeans',
                            seed=None):
    """Fit a Gaussian mixture model and summarise its components.

    Args:
        table: Input DataFrame.
        input_cols: Feature columns, resolved through ``check_col_type``.
        number_of_components: Number of mixture components.
        covariance_type: Covariance structure passed to the estimator.
        tolerance: Convergence threshold (``tol``).
        regularize_covariance: Regularisation added to the covariance
            (``reg_covar``).
        max_iteration: Maximum number of EM iterations (``max_iter``).
        initial_params: Initialisation method (``init_params``).
        seed: Random state of the estimator.

    Returns:
        ``{'model': model}`` holding the training parameters, a per-component
        summary table (weight, mean, covariance), the fitted estimator and
        the markdown report.
    """
    gmm = GaussianMixture(n_components=number_of_components,
                          covariance_type=covariance_type,
                          tol=tolerance,
                          reg_covar=regularize_covariance,
                          max_iter=max_iteration,
                          init_params=initial_params,
                          random_state=seed)
    feature_names, X_train = check_col_type(table, input_cols)
    gmm.fit(X_train)

    component_ids = list(range(number_of_components))
    mean_arr = [gmm.means_[i].tolist() for i in component_ids]
    if covariance_type == 'tied':
        # With 'tied', covariances_ is one matrix shared by all components,
        # so indexing it by component would return a single row (or fail
        # when there are more components than features).
        covar_arr = [gmm.covariances_.tolist() for _ in component_ids]
    else:
        covar_arr = [gmm.covariances_[i].tolist() for i in component_ids]

    out_table = pd.DataFrame()
    out_table['component_number'] = component_ids
    out_table['weight'] = gmm.weights_
    out_table['mean_coordinate'] = mean_arr
    out_table['covariance_matrix'] = covar_arr

    rb = BrtcReprBuilder()
    params = {
        'Input Columns': feature_names,
        'Number of Components': number_of_components,
        'Covariance Type': covariance_type,
        'Tolerance': tolerance,
        'Regularization of Covariance': regularize_covariance,
        'Number of Iteration': max_iteration,
        'Method to Initialize': initial_params
    }

    rb.addMD(
        strip_margin("""
    |## Gaussian Mixture Train Result 
    |
    |### Parameters
    |
    | {params}
    |
    |### Summary
    |
    |{result_table}
    |
    """.format(params=dict2MD(params), result_table=pandasDF2MD(out_table))))

    model = _model_dict('gaussian_mixture_train')
    model['input_cols'] = input_cols
    model['number_of_components'] = number_of_components
    model['covariance_type'] = covariance_type
    model['tolerance'] = tolerance
    model['regularize_covariance'] = regularize_covariance
    model['max_iteration'] = max_iteration
    model['initial_params'] = initial_params
    model['seed'] = seed
    model['summary'] = out_table
    model['gmm'] = gmm
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
Beispiel #9
0
def _mean_shift(table, input_cols, prediction_col='prediction', bandwidth=None, bin_seeding=False, min_bin_freq=1, cluster_all=True):
    """Cluster ``table`` with mean shift and append the cluster labels.

    Args:
        table: Input DataFrame.
        input_cols: Columns used for clustering.
        prediction_col: Name of the output column holding the labels.
        bandwidth, bin_seeding, min_bin_freq, cluster_all: Forwarded to
            ``MeanShift``.

    Returns:
        ``{'out_table': out_table, 'model': model}`` where ``out_table`` is
        a copy of ``table`` with the label column added and ``model`` holds
        the fitted estimator, the input columns and the markdown report.
    """
    inputarr = table[input_cols]

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=bin_seeding, min_bin_freq=min_bin_freq, cluster_all=cluster_all, n_jobs=1)
    ms.fit(inputarr)

    label_name = {
        'bandwidth': 'Bandwidth',
        'bin_seeding': 'Bin Seeding',
        'min_bin_freq': 'Minimum Bin Frequency',
        'cluster_all': 'Cluster All'}
    get_param = ms.get_params()
    # DataFrame.from_items is deprecated and absent from newer pandas; the
    # plain constructor with explicit column order gives the same table.
    param_table = pd.DataFrame(
        {'Parameter': list(label_name.values()),
         'Value': [get_param[key] for key in label_name]},
        columns=['Parameter', 'Value'])

    cluster_centers = ms.cluster_centers_
    n_clusters = len(cluster_centers)
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    labels = ms.labels_

    # A 2-component PCA projection is only produced for multi-column input.
    has_pca = len(input_cols) > 1
    if has_pca:
        pca2_model = PCA(n_components=2).fit(inputarr)
        pca2 = pca2_model.transform(inputarr)

    fig_centers = _mean_shift_centers_plot(input_cols, cluster_centers, colors)
    # Tables with more than 100 rows pass 100 as the third argument of the
    # samples plot; smaller tables pass None.
    sample_limit = 100 if len(table.index) > 100 else None
    fig_samples = _mean_shift_samples_plot(table, input_cols, sample_limit, cluster_centers, colors)

    rb = BrtcReprBuilder()
    if has_pca:
        fig_pca = _mean_shift_pca_plot(labels, cluster_centers, pca2_model, pca2, colors)
        rb.addMD(strip_margin("""
        | ## Mean Shift Result
        | - Coordinates of cluster centers
        | {fig_cluster_centers} 
        | - Samples
        | {fig_pca}
        | {fig_samples}
        | ### Parameters
        | {params}
        """.format(fig_cluster_centers=fig_centers, fig_pca=fig_pca, fig_samples=fig_samples, params=pandasDF2MD(param_table))))
    else:
        rb.addMD(strip_margin("""
        | ## Mean Shift Result
        | - Coordinates of cluster centers
        | {fig_cluster_centers} 
        | - Samples
        | {fig_samples}
        | ### Parameters
        | {params}
        """.format(fig_cluster_centers=fig_centers, fig_samples=fig_samples, params=pandasDF2MD(param_table))))

    model = _model_dict('mean_shift')
    model['model'] = ms
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table':out_table, 'model':model}
Beispiel #10
0
def _correlation(table, vars, method='pearson', height=2.5, corr_prec=2):
    """Compute pairwise correlations and render them as a pair-grid report.

    For every unordered pair of columns in ``vars`` the correlation
    coefficient and its p-value are computed and collected in a table.
    A seaborn ``PairGrid`` is drawn with distribution plots on the diagonal,
    regression plots in the lower triangle and the annotated coefficient
    (plus significance stars) in the upper triangle.

    Args:
        table: pandas DataFrame holding the data.
        vars: sequence of column names to correlate.
        method: 'pearson', 'spearman' or 'kendal'. 'kendall' is accepted as
            an alias of the historical 'kendal' spelling.
        height: height handed to ``PairGrid``; checked to be > 0.
        corr_prec: number of decimals printed for each coefficient;
            checked to be >= 1.

    Returns:
        ``{'result': res}`` where ``res`` holds 'params', 'corr_table'
        (columns x, y, corr, p_value) and '_repr_brtc_'.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    validate(greater_than(height, 0, 'height'),
             greater_than_or_equal_to(corr_prec, 1, 'corr_prec'))

    # Single source of truth for the supported methods. Previously an unknown
    # method left r/p unbound and failed later with an UnboundLocalError.
    corr_funcs = {
        'pearson': stats.pearsonr,
        'spearman': stats.spearmanr,
        'kendal': stats.kendalltau,
        'kendall': stats.kendalltau,
    }
    if method not in corr_funcs:
        raise ValueError(
            "Unsupported correlation method {!r}; expected one of {}.".format(
                method, sorted(corr_funcs)))
    corr_func = corr_funcs[method]

    # Scale the scatter marker area with the facet height.
    s_default = plt.rcParams['lines.markersize']**2.
    scatter_kws = {"s": s_default * height / 6.4}

    # One row per unordered pair (later column first, earlier column second).
    result_arr = []
    for i, var_i in enumerate(vars):
        for var_j in vars[:i]:
            r, p = corr_func(table[var_i], table[var_j])
            result_arr.append([var_i, var_j, r, p])

    df_result = pd.DataFrame(result_arr, columns=['x', 'y', 'corr', 'p_value'])

    def corr(x, y, **kwargs):
        # 'method' arrives through map_upper; **kwargs also absorbs any other
        # keyword arguments the grid may pass along.
        r, p = corr_funcs[kwargs['method']](x, y)

        # Conventional significance stars; a NaN p-value yields no stars.
        if p <= 0.001:
            p_stars = '***'
        elif p <= 0.01:
            p_stars = '**'
        elif p <= 0.05:
            p_stars = '*'
        else:
            p_stars = ''

        corr_text = '{:.{prec}f}'.format(r, prec=corr_prec)
        # Stronger correlations are printed larger.
        font_size = abs(r) * 15 * 2 / corr_prec + 5
        ax = plt.gca()
        ax.annotate(corr_text, [
            .5,
            .5,
        ],
                    xycoords="axes fraction",
                    ha='center',
                    va='center',
                    fontsize=font_size * height)
        ax.annotate(p_stars,
                    xy=(0.65, 0.6),
                    xycoords=ax.transAxes,
                    color='red',
                    fontsize=17 * height)

    g = sns.PairGrid(table, vars=vars, height=height)
    g.map_diag(sns.distplot)
    if method == 'pearson':
        g.map_lower(sns.regplot, scatter_kws=scatter_kws)
    else:
        # Rank-based methods get a lowess smoother instead of a straight line.
        g.map_lower(sns.regplot, lowess=True, scatter_kws=scatter_kws)
    g.map_upper(corr, method=method)

    fig_corr = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin(""" ## Correlation Results
        | ### Correlation Matrix
        | {fig_corr}
        |
        | ### Correlation Table
        | {table}
        """.format(fig_corr=fig_corr, table=pandasDF2MD(df_result))))

    params = {'vars': vars, 'method': method, 'height': height}

    res = dict()
    res['params'] = params
    res['corr_table'] = df_result
    res['_repr_brtc_'] = rb.get()

    return {'result': res}
Beispiel #11
0
def _chi_square_test_of_independence(table,
                                     feature_cols,
                                     label_col,
                                     correction=False):
    """Run a chi-square independence test of each feature against the label.

    For every column in ``feature_cols`` a contingency table against
    ``label_col`` is built and passed to ``stats.chi2_contingency``. The
    statistic, degrees of freedom and p-value are stored in the model under
    ``'result<idx>'`` and a verdict at the 5% level is added to the report.

    Args:
        table: pandas DataFrame holding the data.
        feature_cols: column names tested against the label.
        label_col: name of the label column.
        correction: forwarded as the second positional argument of
            ``stats.chi2_contingency``.

    Returns:
        ``{'model': model}`` with one result table per feature and the
        rendered report under '_repr_brtc_'.
    """
    report = BrtcReprBuilder()
    report.addMD(
        strip_margin("""
    | ## Chi-square Test of Independence Result
    |  - H0: the two categorical variables are independent.
    |  - H1: the two categorical variables are dependent.
    """))

    model = _model_dict('chi_square_test_of_independence')

    for idx, feature_col in enumerate(feature_cols):
        with_totals = pd.crosstab(table[feature_col],
                                  table[label_col],
                                  margins=True)
        # Drop the trailing margin row and column before testing.
        observed = with_totals.iloc[:-1, :-1]

        outcome = stats.chi2_contingency(np.array(observed), correction, 1)
        stat_chi = outcome[0]
        p_chi = outcome[1]
        dof = outcome[2]

        if math.isnan(p_chi):
            dependence = 'Independence of two categorical variables cannot be decided.'
        elif p_chi < 0.05:
            dependence = 'Reject the null hypothesis that two categorical variables are independent at 5% significance level.'
        else:
            dependence = 'No association was found between two categorical variables at 5% significance level.'

        summary_row = {'estimate': stat_chi, 'df': dof, 'p_value': p_chi}
        result_table = pd.DataFrame([summary_row],
                                    columns=['estimate', 'df', 'p_value'])

        model['result{}'.format(idx)] = result_table

        report.addMD(
            strip_margin("""
        |### Label: {label}, Feature: {feature}
        |###### Result Table {idx}
        |  
        |{result_table}
        |
        |{dependence}
        |
        |
        """.format(label=label_col,
                   feature=feature_col,
                   idx=idx,
                   result_table=pandasDF2MD(result_table),
                   dependence=dependence)))

    model['_repr_brtc_'] = report.get()

    return {'model': model}
Beispiel #12
0
def _one_sample_ttest_repr(statistics, result_dict, params):
    """Build the markdown report for a one-sample t test.

    Args:
        statistics: description of the test statistic, printed verbatim.
        result_dict: per-column results. ``result_dict[col]['t_value']`` is
            the t statistic and ``result_dict[col][alternative]`` is a dict
            with 'alternative_hypothesis', 'p_value' and
            'confidence_interval'.
        params: dict providing 'input_cols', 'alternatives',
            'hypothesized_mean' and 'conf_level'; also rendered in full
            under "Parameters".

    Returns:
        The populated ``BrtcReprBuilder`` (the builder itself, not its
        rendered output).
    """
    input_cols = params['input_cols']
    alternatives = params['alternatives']
    hypothesized_mean = params['hypothesized_mean']
    conf_level = params['conf_level']

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## One Sample T Test Result
    | - Statistics = {s}
    | - Hypothesized mean = {h} 
    | - Confidence level = {cl}
    """.format(s=statistics, h=hypothesized_mean, cl=conf_level)))

    ci_label = '%g%% confidence Interval' % (conf_level * 100)
    table_columns = ['alternative hypothesis', 'p-value', ci_label]

    for input_col in input_cols:
        col_results = result_dict[input_col]
        tests = [col_results[alter] for alter in alternatives]

        # DataFrame.from_items was removed in pandas 1.0; the dict
        # constructor with an explicit column list keeps the same order on
        # every pandas version.
        result_table = pd.DataFrame(
            {
                'alternative hypothesis':
                [info['alternative_hypothesis'] for info in tests],
                'p-value': [info['p_value'] for info in tests],
                ci_label: [info['confidence_interval'] for info in tests],
            },
            columns=table_columns)

        rb.addMD(strip_margin("""
        | ### Data = {input_col}
        | - t-value = {t_value} 
        |
        | {result_table}
        """.format(input_col=input_col, t_value=col_results['t_value'], result_table=pandasDF2MD(result_table))))

    rb.addMD(strip_margin("""
        | ### Parameters
        | {params}
        """.format(params=dict2MD(params))))

    return rb
Beispiel #13
0
def _two_sample_ttest_for_stacked_data(table, response_cols, factor_col, alternatives, first=None , second=None , hypo_diff=0, equal_vari='pooled', confi_level=0.95):
    """Run two-sample t tests on stacked data, split by a two-level factor.

    The rows of ``table`` are split into two groups by the value of
    ``factor_col`` (``first`` vs ``second``). For every column in
    ``response_cols`` and every requested alternative a t test is run and a
    confidence interval for the difference in means is computed.

    Args:
        table: pandas DataFrame with the stacked data.
        response_cols: names of the columns to test.
        factor_col: name of the grouping column.
        alternatives: container that may hold 'larger', 'smaller' and/or
            'two-sided'.
        first: factor value of the first group; inferred from the data when
            None.
        second: factor value of the second group; inferred when None.
        hypo_diff: hypothesized difference in means, passed as ``value``.
        equal_vari: 'pooled', 'unequal' or 'auto'. With 'auto' an F test on
            the two sample variances picks 'pooled' or 'unequal' per
            response column.
        confi_level: confidence level of the reported intervals.

    Returns:
        ``{'out_table': result, 'model': model}`` where ``result`` has one
        row per (response column, alternative) and ``model`` only carries
        the rendered report under '_repr_brtc_'.
    """
    # Coerce user-supplied group labels to the type found in the factor
    # column. Only the first non-None element is inspected (every branch
    # breaks out of the loop).
    if first is not None or second is not None:
        check_table = np.array(table[factor_col])
        for element in check_table:
            if element is not None:
                if type(element) != str:
                    # NOTE(review): elements of a bool-dtype column are
                    # presumably numpy.bool_ rather than bool and would take
                    # the float branch below -- confirm against real inputs.
                    if type(element) == bool:
                        if first is not None and second is not None:
                            first = bool(first)
                            second = bool(second)
                            break
                        if first is not None:
                            first = bool(first)
                            break
                        second = bool(second)
                        break
                    else:
                        # Any other non-string type is treated as numeric.
                        if first is not None and second is not None:
                            first = float(first)
                            second = float(second)
                            break
                        if first is not None:
                            first = float(first)
                            break
                        second = float(second)
                        break
                else:
                    # String factor: labels are used as given.
                    break
    # Infer whichever label is missing; this requires exactly two distinct
    # factor values.
    if first is None or second is None:
        tmp_factors=np.unique(table[factor_col])
        if len(tmp_factors) != 2:
            raise_error('0719', 'factor_col')
    if first is None:
        # Take the factor value that is not 'second'.
        if tmp_factors[0] != second:
            first = tmp_factors[0]
        else:
            first = tmp_factors[1]
    if second is None:
        # Take the factor value that is not 'first'.
        if tmp_factors[0] != first:
            second = tmp_factors[0]
        else:
            second = tmp_factors[1]
    table_first = table[table[factor_col] == first]
    table_second = table[table[factor_col] == second]
    # Accumulates the rows of the output table across all response columns.
    tmp_table = []

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    ## Two Sample T Test for Stacked Data Result
    | - Hypothesized mean = {hypo_diff}
    | - Confidence level = {confi_level}
    """.format(hypo_diff=hypo_diff, confi_level=confi_level)))

    for response_col in response_cols:
        # Rows of the per-column report table.
        tmp_model = []
        number1 = len(table_first[response_col])
        number2 = len(table_second[response_col])
        mean1 = (table_first[response_col]).mean()
        mean2 = (table_second[response_col]).mean()
        std1 = (table_first[response_col]).std()
        std2 = (table_second[response_col]).std()
        # Remembers that 'auto' was requested so it can be restored below.
        start_auto = 0
        if equal_vari == 'auto':
            start_auto = 1
            # F test on the variance ratio; the smaller tail is doubled to
            # get a two-sided p-value.
            # NOTE(review): the cdf is evaluated at 1 / f_value with degrees
            # of freedom (number1 - 1, number2 - 1) -- confirm the order.
            f_value = (std1 ** 2) / (std2 ** 2)
            f_test_p_value_tmp = stats.f.cdf(1 / f_value, number1 - 1, number2 - 1)
            if f_test_p_value_tmp > 0.5:
                f_test_p_value = (1 - f_test_p_value_tmp) * 2
            else:
                f_test_p_value = f_test_p_value_tmp * 2
            if f_test_p_value < 0.05:
                equal_vari = 'unequal'
            else:
                equal_vari = 'pooled'
        # Baseline result; it is overwritten by every matching branch below,
        # so it only survives when no known alternative was requested.
        ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'larger', usevar=equal_vari, value=hypo_diff)

        if 'larger' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'larger', usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            # One-sided interval: quantile at confi_level, lower bound only.
            if equal_vari == 'pooled':
                std_number1number2 = sqrt(((number1 - 1) * (std1) ** 2 + (number2 - 1) * (std2) ** 2) / (number1 + number2 - 2))
                margin = t.ppf((confi_level) , df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if equal_vari == 'unequal':
                margin = t.ppf((confi_level) , df) * sqrt(std1 ** 2 / (number1) + std2 ** 2 / (number2))
            tmp_model += [['true difference in means > {}'.format(hypo_diff)] + 
            [ttestresult[1]] + [(mean1 - mean2 - margin, math.inf)]]
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] + 
            ['true difference in means > {}'.format(hypo_diff)] + 
            ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] + 
            [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [mean1 - mean2 - margin] + [math.inf]]

        if 'smaller' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'smaller', usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            # One-sided interval: quantile at confi_level, upper bound only.
            if equal_vari == 'pooled':    
                std_number1number2 = sqrt(((number1 - 1) * (std1) ** 2 + (number2 - 1) * (std2) ** 2) / (number1 + number2 - 2))
                margin = t.ppf((confi_level) , df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if equal_vari == 'unequal':
                margin = t.ppf((confi_level) , df) * sqrt(std1 ** 2 / (number1) + std2 ** 2 / (number2))
            tmp_model += [['true difference in means < {}'.format(hypo_diff)] + 
            [ttestresult[1]] + [(-math.inf, mean1 - mean2 + margin)]] 
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] + 
            ['true difference in means < {}'.format(hypo_diff)] + 
            ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] + 
            [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [-math.inf] + [mean1 - mean2 + margin]] 

        if 'two-sided' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'two-sided', usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            # Two-sided interval: quantile at (confi_level + 1) / 2.
            if equal_vari == 'pooled':    
                std_number1number2 = sqrt(((number1 - 1) * (std1) ** 2 + (number2 - 1) * (std2) ** 2) / (number1 + number2 - 2))
                margin = t.ppf((confi_level + 1) / 2 , df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if equal_vari == 'unequal':
                margin = t.ppf((confi_level + 1) / 2 , df) * sqrt(std1 ** 2 / (number1) + std2 ** 2 / (number2))
            tmp_model += [['true difference in means != {}'.format(hypo_diff)] + 
            [ttestresult[1]] + [(mean1 - mean2 - margin, mean1 - mean2 + margin)]]
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] + 
            ['true difference in means != {}'.format(hypo_diff)] + 
            ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] + 
            [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [mean1 - mean2 - margin] + [mean1 - mean2 + margin]]

        result_model = pd.DataFrame.from_records(tmp_model)
        result_model.columns = ['alternative hypothesis', 'p-value', '%g%% confidence interval' % (confi_level * 100)]
        # The t-value and degrees of freedom printed here come from the last
        # test that ran for this response column.
        rb.addMD(strip_margin("""
        | #### Data = {response_col} by {factor_col}({first},{second})
        
        | - Statistics = t statistic, t distribution with {ttestresult2} degrees of freedom under the null hypothesis
        | - t-value = {ttestresult0}
        |
        | {result_model}
        |
        """.format(ttestresult2=ttestresult[2], response_col=response_col, factor_col=factor_col, first=first, second=second, ttestresult0=ttestresult[0], result_model=pandasDF2MD(result_model))))
        # Restore 'auto' so the F test runs again for the next column.
        if start_auto == 1:
            equal_vari = 'auto'
    result = pd.DataFrame.from_records(tmp_table)
    result.columns = ['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'lower_confidence_interval', 'upper_confidence_interval']

    model = dict()
    model['_repr_brtc_'] = rb.get()    
    return {'out_table' : result, 'model' : model}
Beispiel #14
0
def _normality_test(table,
                    input_cols,
                    method=['kstest', 'jarque_bera', 'anderson']):
    """Run the selected normality tests on each input column.

    Args:
        table: pandas DataFrame holding the data.
        input_cols: names of the columns to test.
        method: container of test keys; any of 'kstest', 'jarque_bera' and
            'anderson'.

    Returns:
        ``{'result': result}`` where ``result[test_key][column]`` holds the
        numbers for that test and column, and ``result['_repr_brtc_']`` the
        rendered report.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Normality test Result""")

    test_name = {
        'kstest': "Kolmogorov-Smirnov test",
        'jarque_bera': "Jarque-Bera test",
        'anderson': "Anderson-Darling test"
    }

    def _add_section(test_key, columns):
        # Append one report section showing the collected columns as a table.
        rb.addMD(
            strip_margin("""
        | ## {method} result
        |{stats_table}
        """.format(method=test_name[test_key],
                   stats_table=pandasDF2MD(pd.DataFrame(columns)))))

    # Tests that yield a (statistic, p-value) pair share one code path.
    p_value_tests = (
        ('kstest', lambda values: kstest(values, 'norm', mode='asymp')),
        ('jarque_bera', lambda values: jarque_bera(values)),
    )
    for test_key, run_test in p_value_tests:
        if test_key not in method:
            continue
        columns = {'data': [], 'estimates': [], 'p_value': []}
        per_column = dict()
        result[test_key] = per_column
        for input_col in input_cols:
            statistic, pval = run_test(table[input_col])
            columns['data'].append(input_col)
            columns['estimates'].append(statistic)
            columns['p_value'].append(pval)
            per_column[input_col] = {'estimates': statistic, 'p_value': pval}
        _add_section(test_key, columns)

    if 'anderson' in method:
        columns = {
            'data': [],
            'estimates': [],
            'critical value': [],
            'significance level': []
        }
        per_column = dict()
        result['anderson'] = per_column
        for input_col in input_cols:
            statistic, critical_val, significance_lvl = anderson(
                table[input_col], dist='norm')
            # One output row per critical value; the statistic is repeated.
            n_levels = len(critical_val)
            columns['data'].extend([input_col] * n_levels)
            columns['estimates'].extend([statistic] * n_levels)
            columns['critical value'].extend(list(critical_val))
            columns['significance level'].extend(list(significance_lvl))
            per_column[input_col] = {
                'estimates': [statistic] * n_levels,
                'critical value': list(critical_val),
                'significance level': list(significance_lvl)
            }
        _add_section('anderson', columns)

    result['_repr_brtc_'] = rb.get()

    return {'result': result}
Beispiel #15
0
def _duncan_test(table, response_cols, factor_col, alpha=0.05):
    """Run Duncan's multiple range test for each response column.

    For every response column the group means by ``factor_col`` are sorted
    in descending order, a critical value is derived for each range size
    and every pair of groups is compared against the critical value that
    matches their distance in the ordering.

    Args:
        table: pandas DataFrame holding the data.
        response_cols: names of the response columns.
        factor_col: name of the grouping column.
        alpha: significance level; the protection level for a range
            spanning ``k`` means is ``(1 - alpha) ** (k - 1)``.

    Returns:
        ``{'result': result}`` where ``result['<response>_<factor>']`` holds
        the 'critical_val', 'mean_by_factor' and 'comp_by_factor' tables
        and ``result['_repr_brtc_']`` the rendered report.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Duncan test Result""")

    for response_col in response_cols:
        # Select the response column before aggregating: aggregating the
        # whole frame first fails on non-numeric columns in recent pandas
        # and does needless work on every other column.
        grouped = table.groupby(factor_col)[response_col]
        mean_by_factor = grouped.mean().sort_values(ascending=False)
        count_by_factor = grouped.count()

        # Within-group sum of squares, computed without a per-row loop.
        residuals = table[response_col] - grouped.transform('mean')
        sse = np.sum(np.square(residuals.values))
        df = table.shape[0] - count_by_factor.shape[0]
        mse = sse / df
        n = harmonic_mean(count_by_factor)
        sigma_d = np.sqrt(mse / n)

        classes = table[factor_col].unique()
        classes_cnt = len(classes)
        range_sizes = list(range(2, classes_cnt + 1))
        critical_val = dict()
        critical_val['p'] = range_sizes
        critical_val['critical_value'] = []
        # Protection level; multiplied by (1 - alpha) for each larger range.
        p = 1 - alpha
        for i in range(1, classes_cnt):
            if p < 0.1 or p > 0.999:
                critical_val['critical_value'].append('Not statistically meaningful')
            else:
                critical_val['critical_value'].append(sigma_d * qsturng(p, i + 1, df))
            p = p * (1 - alpha)

        comp_by_factor = dict()
        comp_by_factor['compared_factors'] = []
        comp_by_factor['difference'] = []
        comp_by_factor['critical_value'] = []
        comp_by_factor['significant'] = []
        titles = mean_by_factor.index
        for i in range(classes_cnt):
            for j in range(i + 1, classes_cnt):
                title = str(titles[i]) + ' - ' + str(titles[j])
                comp_by_factor['compared_factors'].append(title)
                difference = abs(mean_by_factor[titles[i]] - mean_by_factor[titles[j]])
                comp_by_factor['difference'].append(difference)
                # The range size is the number of ordered means spanned.
                critical_value = critical_val['critical_value'][range_sizes.index(j - i + 1)]
                comp_by_factor['critical_value'].append(critical_value)
                if isinstance(critical_value, (float, int)):
                    if difference > critical_value:
                        comp_by_factor['significant'].append('YES')
                    else:
                        comp_by_factor['significant'].append('NO')
                else:
                    # No numeric critical value: propagate the explanation.
                    comp_by_factor['significant'].append(critical_value)
        critical_val = pd.DataFrame(critical_val)
        mean_by_factor = pd.DataFrame(mean_by_factor).reset_index()
        comp_by_factor = pd.DataFrame(comp_by_factor)

        rb.addMD(strip_margin("""
        | ## {response_col} by {factor_col}
        |
        | ### Critical value
        | {critical_val}
        |
        | ### Mean value by factor
        | {mean_by_factor}
        |
        | ### Difference by factor
        | {comp_by_factor}
        """.format(response_col=response_col, factor_col=factor_col,
            critical_val=pandasDF2MD(critical_val, num_rows=critical_val.shape[0]),
            mean_by_factor=pandasDF2MD(mean_by_factor, num_rows=mean_by_factor.shape[0]),
            comp_by_factor=pandasDF2MD(comp_by_factor, num_rows=comp_by_factor.shape[0]))))

        group = response_col + '_' + factor_col
        result[group] = dict()
        result[group]['critical_val'] = critical_val
        result[group]['mean_by_factor'] = mean_by_factor
        result[group]['comp_by_factor'] = comp_by_factor

    result['_repr_brtc_'] = rb.get()

    return {'result': result}
Beispiel #16
0
def _pca(table,
         input_cols,
         new_column_name='projected_',
         n_components=None,
         copy=True,
         whiten=False,
         svd_solver='auto',
         tol=0.0,
         iterated_power='auto',
         seed=None,
         hue=None,
         alpha=0,
         key_col=None):
    """Fit a PCA on the input columns and append the projected columns.

    The PCA is always fitted with all components; ``n_components`` only
    controls how many projected columns are appended to the output table
    and how many component rows are shown in the report.

    Args:
        table: pandas DataFrame holding the data.
        input_cols: names of the columns to decompose.
        new_column_name: prefix of the projected column names; the
            component index is appended.
        n_components: number of projected columns to keep; defaults to
            ``len(input_cols)``.
        copy, whiten, svd_solver, tol, iterated_power: forwarded to
            ``PCA`` under the same names.
        seed: forwarded to ``PCA`` as ``random_state``.
        hue, alpha, key_col: forwarded to the plotting helpers.

    Returns:
        ``{'out_table': out_df, 'model': model}`` where ``out_df`` is the
        input table plus the projected columns and ``model`` carries the
        fitted estimator, its attributes and the rendered report.
    """
    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    # Keyword arguments: every PCA parameter after n_components is
    # keyword-only in current scikit-learn releases.
    pca = PCA(n_components=None,
              copy=copy,
              whiten=whiten,
              svd_solver=svd_solver,
              tol=tol,
              iterated_power=iterated_power,
              random_state=seed)
    pca_model = pca.fit(table[input_cols])

    column_names = [new_column_name + str(i) for i in range(n_components)]

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result[:, :n_components],
                          columns=column_names)

    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    res_components = pca_model.components_
    # NOTE(review): the nested list is kept as in the original so the
    # rendered component table does not change; confirm whether the
    # resulting column index shape is intended.
    res_components_df = pd.DataFrame(data=res_components[:n_components],
                                     columns=[input_cols])
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if n_components == 1:
        # With a single component there is no second axis for a biplot, so
        # the component is plotted against itself.
        sns.scatterplot(x=column_names[0],
                        y=column_names[0],
                        hue=hue,
                        data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(
            0,
            1,
            pc_columns=column_names,
            columns=input_cols,
            singular_values=res_singular_values,
            components=res_components,
            explained_variance_ratio=res_explained_variance_ratio,
            alpha=alpha,
            hue=hue,
            data=out_df,
            ax=plt.gca(),
            key_col=key_col)

    plt.figure()
    fig_scree = _screeplot(res_explained_variance,
                           res_explained_variance_ratio, n_components)

    table_explained_variance = pd.DataFrame(res_explained_variance,
                                            columns=['explained_variance'])
    table_explained_variance[
        'explained_variance_ratio'] = res_explained_variance_ratio
    table_explained_variance[
        'cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum(
        )

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## PCA Result
    | ### Plot
    | {image1}
    |
    | ### Explained Variance
    | {fig_scree}
    | {table_explained_variance}    
    |
    | ### Components
    | {table2}
    |
    | ### Parameters
    | {parameter1}
    """.format(image1=plt_two,
               fig_scree=fig_scree,
               table_explained_variance=pandasDF2MD(table_explained_variance),
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df))))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['_repr_brtc_'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    return {'out_table': out_df, 'model': model}
Beispiel #17
0
def _cross_table(table, input_cols_1, input_cols_2, result='N', margins=False):
    """Build a cross tabulation of two groups of columns.

    Args:
        table: pandas DataFrame holding the data.
        input_cols_1: column names that form the rows of the cross table.
        input_cols_2: column names that form its columns.
        result: 'N' for raw counts, or 'N / Row Total',
            'N / Column Total', 'N / Total' for the normalised variants.
        margins: forwarded to ``pd.crosstab``.

    Returns:
        ``{'model': model}`` with the requested 'result' type, the
        'result_table' (row labels in the first column) and the rendered
        report under '_repr_brtc_'.
    """
    row_keys = [table[col] for col in input_cols_1]
    column_keys = [table[col] for col in input_cols_2]

    # Map each result type to the crosstab normalisation it stands for.
    normalize_by = {
        'N': False,
        'N / Row Total': 'index',
        'N / Column Total': 'columns',
        'N / Total': 'all',
    }
    if result not in normalize_by:
        raise_runtime_error("Please check 'result'.")
    result_table = pd.crosstab(row_keys,
                               column_keys,
                               margins=margins,
                               normalize=normalize_by[result])

    def _flat_labels(labels, n_keys):
        # Turn index labels into display names; tuple labels are joined
        # with '_'.
        if n_keys == 1:
            return [str(label) for label in labels]
        if margins:
            # The last label is the margin entry; only its first element is
            # kept.
            joined = ['_'.join(str(part) for part in label)
                      for label in labels[:-1]]
            return joined + [labels[-1][0]]
        return ['_'.join(str(part) for part in label) for label in labels]

    joined_row_name = _flat_labels(list(result_table.index),
                                   len(input_cols_1))
    joined_column_name = _flat_labels(list(result_table.columns),
                                      len(input_cols_2))

    # The row labels become the first column, whose header is the result
    # type.
    result_table.insert(loc=0, column=' ', value=joined_row_name)
    result_table.columns = np.append(result, joined_column_name)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Cross Table Result
    | ### Result Type : {result}
    |
    | #### Result Table
    |
    | {result_table}
    |
    """.format(result=result,
               result_table=pandasDF2MD(result_table,
                                        num_rows=len(result_table.index) +
                                        1))))

    model = _model_dict('cross_table')
    model['result'] = result
    model['result_table'] = result_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Beispiel #18
0
def _mlp_regression_train(table,
                          feature_cols,
                          label_col,
                          hidden_layer_sizes=(100, ),
                          activation='relu',
                          solver='adam',
                          alpha=0.0001,
                          batch_size_auto=True,
                          batch_size='auto',
                          learning_rate='constant',
                          learning_rate_init=0.001,
                          max_iter=200,
                          random_state=None,
                          tol=0.0001):
    """Train an ``MLPRegressor`` and report its fit on the training data.

    Args:
        table: pandas DataFrame holding the training data.
        feature_cols: names of the feature columns.
        label_col: name of the target column.
        hidden_layer_sizes, activation, solver, alpha, batch_size,
        learning_rate, learning_rate_init, max_iter, random_state, tol:
            forwarded to ``MLPRegressor`` under the same names.
        batch_size_auto: accepted for interface compatibility; it is not
            read by this function.

    Returns:
        ``{'model': model}`` where ``model`` holds the fitted estimator
        ('mlp_model'), its weights and loss, the training-set metrics, the
        hyper-parameters and the rendered report ('_repr_brtc_').
    """
    features = table[feature_cols]
    label = table[label_col]

    mlp_model = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes,
                             activation=activation,
                             solver=solver,
                             alpha=alpha,
                             batch_size=batch_size,
                             learning_rate=learning_rate,
                             learning_rate_init=learning_rate_init,
                             max_iter=max_iter,
                             shuffle=True,
                             random_state=random_state,
                             tol=tol)
    mlp_model.fit(features, label)

    # Metrics are computed on the training data itself.
    predict = mlp_model.predict(features)

    _mean_absolute_error = mean_absolute_error(label, predict)
    _mean_squared_error = mean_squared_error(label, predict)
    _r2_score = r2_score(label, predict)

    # DataFrame.from_items was removed in pandas 1.0; the dict constructor
    # with an explicit column list keeps the same column order.
    result_table = pd.DataFrame(
        {
            'Metric': ['Mean Absolute Error', 'Mean Squared Error', 'R2 Score'],
            'Score': [_mean_absolute_error, _mean_squared_error, _r2_score],
        },
        columns=['Metric', 'Score'])

    label_name = {
        'hidden_layer_sizes': 'Hidden Layer Sizes',
        'activation': 'Activation Function',
        'solver': 'Solver',
        'alpha': 'Alpha',
        'batch_size': 'Batch Size',
        'learning_rate': 'Learning Rate',
        'learning_rate_init': 'Learning Rate Initial',
        'max_iter': 'Max Iteration',
        'random_state': 'Seed',
        'tol': 'Tolerance'
    }
    get_param = mlp_model.get_params()
    param_table = pd.DataFrame(
        {
            'Parameter': list(label_name.values()),
            'Value': [get_param[key] for key in label_name],
        },
        columns=['Parameter', 'Value'])

    rb = BrtcReprBuilder()
    # The heading previously read "Classification", a copy-paste slip in a
    # regression trainer.
    rb.addMD(
        strip_margin("""
    | ### MLP Regression Result
    | {result}
    | ### Parameters
    | {list_parameters}
    """.format(result=pandasDF2MD(result_table),
               list_parameters=pandasDF2MD(param_table))))

    model = _model_dict('mlp_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercepts'] = mlp_model.intercepts_
    model['coefficients'] = mlp_model.coefs_
    model['loss'] = mlp_model.loss_
    model['mean_absolute_error'] = _mean_absolute_error
    model['mean_squared_error'] = _mean_squared_error
    model['r2_score'] = _r2_score
    model['activation'] = activation
    model['solver'] = solver
    model['alpha'] = alpha
    model['batch_size'] = batch_size
    model['learning_rate'] = learning_rate
    model['learning_rate_init'] = learning_rate_init
    model['max_iter'] = max_iter
    model['random_state'] = random_state
    model['tol'] = tol
    model['mlp_model'] = mlp_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _hierarchical_clustering_post(model, num_clusters, cluster_col='cluster'):
    """Cut a hierarchical-clustering result into flat clusters.

    Parameters
    ----------
    model : dict
        Output of the clustering step. Keys read here: 'model' (the linkage
        array handed to ``fcluster`` / ``leaders``), 'input_mode'
        ('original' or 'matrix'), 'linkage_matrix', 'parameters', and
        'table' or 'dist_matrix' depending on the input mode.
    num_clusters : int
        Passed to ``fcluster`` as ``t`` with ``criterion='maxclust'``.
    cluster_col : str
        Name of the column that receives the cluster id.

    Returns
    -------
    dict
        ``{'out_table': prediction_table, 'model': model}`` where the new
        model holds 'clusters_info' and the rendered report.

    Raises
    ------
    ValueError
        If ``model['input_mode']`` is neither 'original' nor 'matrix'.
    """
    Z = model['model']
    mode = model['input_mode']
    out_table = model['linkage_matrix']

    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    # Work on copies: the cluster column is added below and must not leak
    # into the tables stored inside the caller's model.
    if mode == 'original':
        prediction_table = model['table'].copy()
    elif mode == 'matrix':
        prediction_table = model['dist_matrix'][['name']].copy()
    else:
        raise ValueError(
            "Unsupported input_mode {!r}; expected 'original' or 'matrix'."
            .format(mode))

    if num_clusters == 1:
        prediction_table[cluster_col] = [1] * len(prediction_table.index)
    else:
        prediction_table[cluster_col] = predict

    # L holds the leader node of each flat cluster, M the matching cluster id.
    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        # Look the leader up in the first, then the second column of Z and
        # take the name recorded for that linkage step.
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined column2'][select_indices])

    clusters_info_table = pd.DataFrame([])
    if num_clusters == 1:
        # A single cluster is described by the last row of the linkage table.
        last_step = len(Z) - 1
        clusters_info_table[cluster_col] = [1]
        clusters_info_table['name of clusters'] = [
            out_table['name of clusters'][last_step]
        ]
        clusters_info_table['number of entities'] = [
            out_table['number of original'][last_step]
        ]
    else:
        clusters_info_table[cluster_col] = M
        clusters_info_table['name of clusters'] = which_cluster
        clusters_info_table = clusters_info_table.sort_values(cluster_col)
        # bincount is indexed by cluster id; drop ids that never occur
        # (id 0 in particular) so the counts line up with the sorted rows.
        cluster_count = np.bincount(prediction_table[cluster_col])
        cluster_count = cluster_count[cluster_count != 0]
        clusters_info_table['number of entities'] = list(cluster_count)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Hierarchical Clustering Post Process Result"""))
    rb.addMD(
        strip_margin("""
    |### Parameters
    |
    |{display_params}
    |
    |### Clusters Information
    |
    |{clusters_info_table}
    |
    """.format(display_params=dict2MD(model['parameters']),
               clusters_info_table=pandasDF2MD(
                   clusters_info_table,
                   num_rows=len(clusters_info_table.index) + 1))))

    model = _model_dict('hierarchical_clustering_post_process')
    model['clusters_info'] = clusters_info_table
    model['_repr_brtc_'] = rb.get()

    return {'out_table': prediction_table, 'model': model}
Beispiel #20
0
def _oneway_anova(table, response_cols, factor_col):
    """Run a one-way ANOVA of each response column against one factor.

    For every response column the function draws a box plot per factor
    level, fits ``Q(response) ~ C(Q(factor))`` with ``ols``, computes the
    ANOVA table with ``anova_lm`` and adds residual diagnostics (distribution
    plot and Q-Q plot) to the report.

    Parameters
    ----------
    table : pandas.DataFrame
        Input data.
    response_cols : list of str
        Columns analysed one at a time.
    factor_col : str
        Grouping column.

    Returns
    -------
    dict
        ``{'result': result}``; ``result['_grouped_data'][col]['p_value']``
        holds the first entry of the ANOVA 'PR(>F)' column for each response
        column and ``result['_repr_brtc_']`` the rendered report.
    """
    rb = BrtcReprBuilder()
    # Title line carries the same '|' margin as every other report block.
    rb.addMD(
        strip_margin("""
    | ## One-way Analysis of Variance Result
    """))
    groups = table[factor_col].unique()
    groups.sort()
    # Total label length decides how far the x tick labels are rotated.
    sum_len = np.sum([len(str(group)) for group in groups])

    result = dict()
    result['_grouped_data'] = dict()

    for response_col in response_cols:
        result['_grouped_data'][response_col] = dict()

        ax = sns.boxplot(x=factor_col,
                         y=response_col,
                         data=table,
                         order=groups)
        if sum_len > 512:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        elif sum_len > 64:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

        fig_box = plt2MD(plt)
        plt.clf()

        fitted = ols(
            """Q('{response_col}') ~ C(Q('{factor_col}'))""".format(
                response_col=response_col, factor_col=factor_col),
            table).fit()  # TODO factor_col = class => error
        anova = anova_lm(fitted)

        # Strip the formula wrappers so the row labels show bare column names.
        index_list = anova.index.tolist()
        for wrapper in ("C(Q('", "'))", "Q('", "')"):
            index_list = [label.replace(wrapper, "") for label in index_list]
        anova.insert(0, '', index_list)

        anova_df = pandasDF2MD(anova)

        # The ANOVA table is indexed by term label, so the first row has to
        # be addressed by position rather than with an integer key.
        p_value = anova['PR(>F)'].iloc[0]

        residual = fitted.resid

        sns.distplot(residual)
        distplot = plt2MD(plt)
        plt.clf()

        sm.qqplot(residual, line='s')
        qqplot = plt2MD(plt)
        plt.clf()

        rb.addMD(
            strip_margin("""
        | ## {response_col} by {factor_col}
        | {fig_box}
        |
        | ### ANOVA
        | {anova_df}
        | 
        | ### Diagnostics
        | {distplot}
        |
        | {qqplot}
        """.format(response_col=response_col,
                   factor_col=factor_col,
                   fig_box=fig_box,
                   anova_df=anova_df,
                   distplot=distplot,
                   qqplot=qqplot)))

        result['_grouped_data'][response_col]['p_value'] = p_value

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def _penalized_linear_regression_train(table, feature_cols, label_col, regression_type='ridge', alpha=1.0, l1_ratio=0.5, fit_intercept=True, max_iter=1000, tol=0.0001, random_state=None):
    """Fit a ridge, lasso or elastic-net regression and build its report.

    Parameters
    ----------
    table : pandas.DataFrame
        Training data; it is copied, the input is not modified.
    feature_cols : list of str
        Predictor columns.
    label_col : str
        Target column.
    regression_type : {'ridge', 'lasso', 'elastic_net'}
        Estimator to use; any other value is reported through
        ``raise_runtime_error``.
    alpha, l1_ratio, fit_intercept, max_iter, tol, random_state
        Forwarded to the estimator. ``l1_ratio`` is used only for
        'elastic_net'; ``max_iter`` is not forwarded to ``Ridge``, which is
        created with ``max_iter=None``.

    Returns
    -------
    dict
        ``{'model': model}``; the model holds the fitted estimator, the
        coefficient table, the prediction/residual table and the report.
    """
    out_table = table.copy()
    features = out_table[feature_cols]
    label = out_table[label_col]
    if regression_type == 'ridge':
        regression_model = Ridge(alpha=alpha, fit_intercept=fit_intercept, max_iter=None, tol=tol, solver='auto', random_state=random_state)
    elif regression_type == 'lasso':
        regression_model = Lasso(alpha=alpha, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, random_state=random_state, selection='random')
    elif regression_type == 'elastic_net':
        regression_model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, random_state=random_state, selection='random')
    else:
        raise_runtime_error("Please check 'regression_type'.")

    # Fit exactly once. With selection='random' and no seed, every extra fit
    # may land on slightly different coefficients, so the reported table and
    # the stored estimator must come from the same fit.
    regression_model.fit(features, label)

    out_table1 = pd.DataFrame([])
    out_table1['x_variable_name'] = list(feature_cols)
    out_table1['coefficient'] = regression_model.coef_
    if fit_intercept:
        intercept = pd.DataFrame([['intercept', regression_model.intercept_]], columns=['x_variable_name', 'coefficient'])
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        out_table1 = pd.concat([out_table1, intercept], ignore_index=True)

    predict = regression_model.predict(features)
    residual = label - predict

    out_table['predict'] = predict
    out_table['residual'] = residual

    # Parameters shown in the report; the L1 ratio only exists for elastic net.
    params = {
        'Feature Columns': feature_cols,
        'Label Column': label_col,
        'Regression Type': regression_type,
        'Regularization (Penalty Weight)': alpha,
    }
    if regression_type == 'elastic_net':
        params['L1 Ratio'] = l1_ratio
    params['Fit Intercept'] = fit_intercept
    params['Maximum Number of Iterations'] = max_iter
    params['Tolerance'] = tol

    score = {
        'MSE': mean_squared_error(label, predict),
        'R2': r2_score(label, predict)
    }

    # Predicted vs. actual, with the identity line as reference.
    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    p1x = np.min(predict)
    p2x = np.max(predict)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()

    # Residuals vs. predicted values.
    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    # Q-Q plot of the residuals.
    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    # Distribution of the residuals.
    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()

    # Magnitude of the coefficients, sorted ascending.
    plt.figure()
    predictors = features.columns
    coef = pd.Series(regression_model.coef_, predictors).sort_values()
    coef.plot(kind='bar', title='Model Coefficients')
    plt.tight_layout()
    fig_model_coefficients = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | # Penalized Linear Regression Result
    | ### Selected Parameters: 
    | {params}
    |
    | ## Results
    | ### Model Parameters
    | {out_table1}
    |
    | ### Prediction and Residual
    | {out_table2}
    |
    | ### Regression Score
    | {score}
    |
    """.format(params=dict2MD(params), out_table1=pandasDF2MD(out_table1), out_table2=pandasDF2MD(out_table, num_rows=len(out_table) + 1), score=dict2MD(score))))
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    |
    | ### Magnitude of Coefficients
    | {image5}
    |
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3,
               image5=fig_model_coefficients
               )))

    model = _model_dict('penalized_linear_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['regression_type'] = regression_type
    model['regression_model'] = regression_model
    model['model_parameters'] = out_table1
    model['prediction_residual'] = out_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Beispiel #22
0
def _logistic_regression_train(table,
                               feature_cols,
                               label_col,
                               penalty='l2',
                               dual=False,
                               tol=0.0001,
                               C=1.0,
                               fit_intercept=True,
                               intercept_scaling=1,
                               class_weight=None,
                               random_state=None,
                               solver='liblinear',
                               max_iter=100,
                               multi_class='ovr',
                               verbose=0,
                               warm_start=False,
                               n_jobs=1):
    """Fit a logistic regression and summarise coefficients and fit scores.

    Parameters
    ----------
    table : pandas.DataFrame
        Training data.
    feature_cols : list of str
        Predictor columns, resolved through ``check_col_type``.
    label_col : str
        Target column; a 'continuous' target is rejected via ``raise_error``.
    penalty, dual, tol, C, fit_intercept, intercept_scaling, class_weight,
    random_state, solver, max_iter, multi_class, verbose, warm_start, n_jobs
        Forwarded by name to ``LogisticRegression``.

    Returns
    -------
    dict
        ``{'model': model}``. The model holds the fitted estimator, standard
        errors, AIC, BIC, the summary table and the report; for a two-class
        target it also holds Wald statistics and p-values.
    """
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]

    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    # Keyword arguments: newer scikit-learn releases no longer accept these
    # parameters positionally.
    lr_model = LogisticRegression(penalty=penalty,
                                  dual=dual,
                                  tol=tol,
                                  C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state,
                                  solver=solver,
                                  max_iter=max_iter,
                                  multi_class=multi_class,
                                  verbose=verbose,
                                  warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2
    prob = lr_model.predict_proba(features)

    # Log-likelihood of the observed labels. Summing logs instead of
    # multiplying probabilities keeps the value finite on large tables, where
    # the raw product underflows to 0 and turns AIC/BIC into inf.
    class_index = {cls: idx for idx, cls in enumerate(classes)}
    tmp_label = np.array([class_index[value] for value in label])
    log_likelihood = np.sum(
        np.log(prob[np.arange(len(tmp_label)), tmp_label]))

    k = len(feature_cols) + 1 if fit_intercept else len(feature_cols)
    aic = 2 * k - 2 * log_likelihood
    bic = np.log(len(table)) * k - 2 * log_likelihood

    # Design matrix, with a leading column of ones when an intercept is fitted.
    if fit_intercept:
        x_design = np.hstack([np.ones((features.shape[0], 1)), features])
    else:
        x_design = features.values

    if is_binary:
        # Per-row weight p * (1 - p); with two classes that is the row product.
        v = np.prod(prob, axis=1)
        x_design_modi = x_design * v[:, np.newaxis]
        cov_logit = np.linalg.inv(np.dot(x_design_modi.T, x_design))
        std_err = np.sqrt(np.diag(cov_logit))
        if fit_intercept:
            logit_params = np.insert(coefficients, 0, intercept)
        else:
            # coef_ is 2-D (1, n_features); flatten so the Wald statistics
            # are 1-D like std_err and fit one summary column.
            logit_params = coefficients.ravel()
        wald = (logit_params / std_err)**2
        p_values = 1 - chi2.cdf(wald, 1)
    else:
        # One set of standard errors per class, each using that class's
        # p * (1 - p) as row weight.
        std_err = []
        for class_pos in range(len(classes)):
            v = prob[:, class_pos] * (1 - prob[:, class_pos])
            x_design_modi = x_design * v[:, np.newaxis]
            cov_logit = np.linalg.inv(np.dot(x_design_modi.T, x_design))
            std_err.append(np.sqrt(np.diag(cov_logit)))
        std_err = np.array(std_err)

    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_names})
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)),
                                    axis=0)
    else:
        summary = pd.DataFrame({'features': feature_names})
        coef_trans = np.transpose(coefficients)

    if is_binary:
        summary = pd.concat(
            (summary, pd.DataFrame(coef_trans, columns=[classes[0]])), axis=1)
        summary = pd.concat(
            (summary, pd.DataFrame(std_err, columns=['standard_error']),
             pd.DataFrame(wald, columns=['wald_statistic']),
             pd.DataFrame(p_values, columns=['p_value'])),
            axis=1)
    else:
        summary = pd.concat(
            (summary, pd.DataFrame(coef_trans, columns=classes)), axis=1)
        columns = ['standard_error_{}'.format(cls) for cls in classes]
        summary = pd.concat(
            (summary, pd.DataFrame(std_err.T, columns=columns)), axis=1)
        # Interleave each class's coefficient column with its standard error.
        arrange_col = ['features']
        for cls in classes:
            arrange_col.append(cls)
            arrange_col.append('standard_error_{}'.format(cls))
        summary = summary[arrange_col]

    rb = BrtcReprBuilder()
    if is_binary:
        rb.addMD(
            strip_margin("""
        | ## Logistic Regression Result
        | ### Summary
        | {table1}
        |
        | ##### Column '{small}' is the coefficients under the assumption ({small} = 0, {big} = 1).
        |
        | #### AIC : {aic}
        |
        | #### BIC : {bic}
        """.format(small=classes[0],
                   big=classes[1],
                   table1=pandasDF2MD(summary, num_rows=100),
                   aic=aic,
                   bic=bic)))
    else:
        rb.addMD(
            strip_margin("""
        | ## Logistic Regression Result
        | ### Summary
        | {table1}
        |
        | ##### Each column whose name is one of classes of Label Column is the coefficients under the assumption it is 1 and others are 0.
        |
        | ##### For example, column '{small}' is the coefficients under the assumption ({small} = 1, others = 0).
        |
        | #### AIC : {aic}
        |
        | #### BIC : {bic}
        """.format(small=classes[0],
                   table1=pandasDF2MD(summary, num_rows=100),
                   aic=aic,
                   bic=bic)))

    model = _model_dict('logistic_regression_model')
    model['standard_errors'] = std_err
    model['aic'] = aic
    model['bic'] = bic
    if is_binary:
        model['wald_statistics'] = wald
        model['p_values'] = p_values
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()
    model['summary'] = summary
    return {'model': model}
Beispiel #23
0
def _als_train(table, user_col, item_col, rating_col, mode = 'train', number=10, filter = True, implicit = False, iterations = 10, reg_param = 0.1, rank = 10, alpha = 1.0, seed = None, targets = None, workers = 1):
    """Train an ALS recommender, or return top-N recommendations.

    Parameters
    ----------
    table : pandas.DataFrame
        Rating records.
    user_col, item_col, rating_col : str
        Columns holding users, items and ratings.
    mode : str
        'Topn' returns a recommendation table; any other value returns the
        trained model.
    number : int
        Number of recommendations per user in 'Topn' mode.
    filter : bool
        Passed to ``recommend`` as ``filter_already_liked_items``.
    implicit, iterations, reg_param, rank, alpha, seed
        Passed to ``AlternatingLeastSquares``.
    targets : sequence, optional
        Users to recommend for; defaults to every user seen in ``table``.
    workers : int
        1 runs the recommendations in-process; any other value delegates to
        ``apply_by_multiprocessing_list_to_list``.

    Returns
    -------
    dict
        ``{'out_table': DataFrame}`` in 'Topn' mode, else ``{'model': model}``.
    """
    table_user_col = table[user_col]
    table_item_col = table[item_col]
    # Ratings equal to 0 are replaced by -1 before building the matrix.
    ratings = table[rating_col]
    ratings = np.where(ratings == 0, -1, ratings)

    # Encode users and items as consecutive integer ids.
    user_encoder = preprocessing.LabelEncoder()
    item_encoder = preprocessing.LabelEncoder()
    user_encoder.fit(table_user_col)
    item_encoder.fit(table_item_col)
    user_correspond = user_encoder.transform(table_user_col)
    item_correspond = item_encoder.transform(table_item_col)
    item_users = csr_matrix((ratings, (item_correspond, user_correspond)))

    als_model = AlternatingLeastSquares(factors = rank,implicit = implicit,iterations = iterations, regularization = reg_param, alpha = alpha, seed = seed)
    als_model.fit(item_users)

    # One row per user / item, with the factor vector stored as a plain list.
    user_factors = pd.DataFrame(user_encoder.classes_, columns = [user_col])
    user_factors['features'] = [list(row) for row in als_model.user_factors]
    item_factors = pd.DataFrame(item_encoder.classes_, columns = [item_col])
    item_factors['features'] = [list(row) for row in als_model.item_factors]

    if mode == 'Topn':
        if targets is None:
            targets = user_encoder.classes_
        # np.int was only an alias of the builtin int and was removed in
        # NumPy 1.24; the builtin keeps the same dtype comparison working.
        if table_user_col.dtype in (np.floating, float, int, np.int64):
            targets = [float(i) for i in targets]
        targets_en = user_encoder.transform(targets)
        user_items = item_users.T.tocsr()
        Topn_result = []
        if workers == 1:
            for user in targets_en:
                recommendations_corre = als_model.recommend(user, user_items, number, filter_already_liked_items= filter)
                # Flatten to [item, rating, item, rating, ...], mapping the
                # encoded item ids back to their original labels.
                recommendations = []
                for (item, rating) in recommendations_corre:
                    recommendations += [item_encoder.inverse_transform([item])[0], rating]
                Topn_result += [recommendations]
        else:
            Topn_result_tmp = apply_by_multiprocessing_list_to_list(targets_en, _recommend_multi, user_items = user_items, number = number, item_encoder = item_encoder, als_model = als_model, workers = workers, filter = filter)
            # Concatenate the per-worker result lists in worker order.
            for i in range(workers):
                Topn_result += Topn_result_tmp[i]
        Topn_result = pd.DataFrame(Topn_result)
        Topn_result = pd.concat([pd.DataFrame(targets), Topn_result], axis=1, ignore_index=True)
        column_names = ['user']
        for i in range(number):
            column_names += ['item_top%d' %(i+1), 'rating_top%d' %(i+1)]
        Topn_result.columns = column_names
        return {'out_table' : Topn_result}

    parameters = dict()
    parameters['Iterations'] = iterations
    parameters['Reg Param'] = reg_param
    parameters['Seed'] = seed
    parameters['Rank'] = rank
    if implicit:
        parameters['alpha'] = alpha
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## ALS Train Result
    |
    | ### Parameters
    | {parameters} 
    | ### Item Factors
    | {item_factors}
    | ### User Factors
    | {user_factors}
    |
    """.format(item_factors=pandasDF2MD(item_factors, num_rows = 100), user_factors=pandasDF2MD(user_factors, num_rows = 100), parameters=dict2MD(parameters))))

    model = _model_dict('ALS')
    model['als_model'] = als_model
    model['item_encoder'] = item_encoder
    model['user_encoder'] = user_encoder
    model['user_col'] = user_col
    model['item_col'] = item_col
    model['user_factors'] = user_factors
    model['item_factors'] = item_factors
    model['_repr_brtc_'] = rb.get()
    return{'model' : model}
Beispiel #24
0
def dataframe_to_md(table, n=20, precision=None, max_width=None):
    """Convert ``table`` to Markdown by delegating to ``pandasDF2MD``.

    ``n`` is forwarded as ``num_rows``. ``precision`` and ``max_width`` are
    accepted but not used by this function.
    """
    markdown = pandasDF2MD(table, num_rows=n)
    return markdown
Beispiel #25
0
def _hierarchical_clustering_post(model, num_clusters, cluster_col='cluster'):
    """Cut a hierarchical-clustering result into flat clusters.

    Two model layouts are handled:

    * with a 'linkage_matrix' key: the linkage array is read from
      ``model['model']`` and the rows to label from 'table' or 'dist_matrix',
      depending on ``model['input_mode']``;
    * without it: the linkage array is rebuilt from ``model['table_1']``
      (columns 'clusters_joined1', 'clusters_joined2', 'height',
      'frequency') and the rows are named 'pt_0', 'pt_1', ...

    Parameters
    ----------
    model : dict
        Output of the clustering step, in one of the layouts above.
    num_clusters : int
        Passed to ``fcluster`` as ``t`` with ``criterion='maxclust'``.
    cluster_col : str
        Name of the column that receives the cluster id.

    Returns
    -------
    dict
        ``{'out_table': prediction_table, 'model': model}`` where the new
        model holds 'clusters_info' and the rendered report.

    Raises
    ------
    ValueError
        If a 'linkage_matrix' model has an ``input_mode`` other than
        'original' or 'matrix'.
    """
    # Evaluated once, before `model` is rebound to the output model below.
    has_linkage_matrix = 'linkage_matrix' in model

    if not has_linkage_matrix:
        model_table = model['table_1']
        length = len(model_table) + 1
        # Copy so the name re-encoding below works on an independent frame
        # instead of assigning into a slice of the model's table.
        tmp_table = model_table[[
            'clusters_joined1', 'clusters_joined2', 'height', 'frequency'
        ]].copy()

        # Names without the 'CL' prefix are collected to fit the encoder.
        tmp = [
            i for i in tmp_table[['clusters_joined1', 'clusters_joined2'
                                  ]].values.flatten()
            if i.split("_")[0] != 'CL'
        ]
        label_encoder = preprocessing.LabelEncoder().fit(tmp)
        tmp_table['clusters_joined2'] = tmp_table['clusters_joined2'].apply(
            _change_name, length=length, encoder=label_encoder)
        tmp_table['clusters_joined1'] = tmp_table['clusters_joined1'].apply(
            _change_name, length=length, encoder=label_encoder)
        Z = tmp_table.values
        predict = fcluster(Z, t=num_clusters, criterion='maxclust')
        prediction_table = pd.DataFrame()
        prediction_table['name'] = ['pt_' + str(i) for i in range(length)]
    else:
        Z = model['model']
        mode = model['input_mode']
        out_table = model['linkage_matrix']
        predict = fcluster(Z, t=num_clusters, criterion='maxclust')
        # Work on copies: the cluster column is added below and must not
        # leak into the tables stored inside the caller's model.
        if mode == 'original':
            prediction_table = model['table'].copy()
        elif mode == 'matrix':
            prediction_table = model['dist_matrix'][['name']].copy()
        else:
            raise ValueError(
                "Unsupported input_mode {!r}; expected 'original' or "
                "'matrix'.".format(mode))

    if num_clusters == 1:
        prediction_table[cluster_col] = [1] * len(prediction_table.index)
    else:
        prediction_table[cluster_col] = predict

    # L holds the leader node of each flat cluster, M the matching cluster id.
    L, M = leaders(Z, predict)
    which_cluster = []
    if not has_linkage_matrix:
        for leader in L:
            which_cluster.append('CL_' + str(2 * length - 1 - leader))
    else:
        for leader in L:
            # Look the leader up in the first, then the second column of Z
            # and take the name recorded for that linkage step.
            if leader in Z[:, 0]:
                select_indices = np.where(Z[:, 0] == leader)[0][0]
                which_cluster.append(
                    out_table['joined column1'][select_indices])
            elif leader in Z[:, 1]:
                select_indices = np.where(Z[:, 1] == leader)[0][0]
                which_cluster.append(
                    out_table['joined column2'][select_indices])

    clusters_info_table = pd.DataFrame([])
    if num_clusters == 1 and has_linkage_matrix:
        # A single cluster is described by the last row of the linkage table.
        last_step = len(Z) - 1
        clusters_info_table[cluster_col] = [1]
        clusters_info_table['name of clusters'] = [
            out_table['name of clusters'][last_step]
        ]
        clusters_info_table['number of entities'] = [
            out_table['number of original'][last_step]
        ]
    else:
        clusters_info_table[cluster_col] = M
        clusters_info_table['name of clusters'] = which_cluster
        clusters_info_table = clusters_info_table.sort_values(cluster_col)
        # bincount is indexed by cluster id; drop ids that never occur
        # (id 0 in particular) so the counts line up with the sorted rows.
        cluster_count = np.bincount(prediction_table[cluster_col])
        cluster_count = cluster_count[cluster_count != 0]
        clusters_info_table['number of entities'] = list(cluster_count)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""# Hierarchical Clustering Post Process Result"""))
    if has_linkage_matrix:
        rb.addMD(
            strip_margin("""
        |### Parameters
        |
        |{display_params}
        |
        |### Clusters Information
        |
        |{clusters_info_table}
        |
        """.format(display_params=dict2MD(model['parameters']),
                   clusters_info_table=pandasDF2MD(
                       clusters_info_table,
                       num_rows=len(clusters_info_table.index) + 1))))
    else:
        # Models without a linkage matrix get no parameter section.
        rb.addMD(
            strip_margin("""
        |
        |### Clusters Information
        |
        |{clusters_info_table}
        |
        """.format(clusters_info_table=pandasDF2MD(
                clusters_info_table,
                num_rows=len(clusters_info_table.index) + 1))))
    model = _model_dict('hierarchical_clustering_post_process')
    model['clusters_info'] = clusters_info_table
    model['_repr_brtc_'] = rb.get()

    return {'out_table': prediction_table, 'model': model}
Beispiel #26
0
def _ftest_for_stacked_data(table,
                            response_cols,
                            factor_col,
                            alternatives,
                            first=None,
                            second=None,
                            confi_level=0.95):
    """F test for equality of two variances on stacked (long-format) data.

    The rows of ``table`` are split into two groups by the value of
    ``factor_col``. For every response column the statistic is
    ``F = var(first group) / var(second group)``, compared with an F
    distribution with ``n1 - 1`` and ``n2 - 1`` degrees of freedom.

    Parameters
    ----------
    table : pandas.DataFrame
        Input data.
    response_cols : list of str
        Columns tested one at a time.
    factor_col : str
        Column whose two levels define the groups.
    alternatives : collection of str
        Any of 'larger', 'smaller', 'two-sided'.
    first, second : optional
        Factor levels of the numerator / denominator group. A missing level
        is taken from the data, which must then hold at most two distinct
        levels.
    confi_level : float
        Confidence level of the reported interval for the variance ratio.

    Returns
    -------
    dict
        ``{'out_table': result, 'model': model}``: one row per response
        column and alternative, plus the rendered report.

    Raises
    ------
    Exception
        If a level has to be inferred and the factor column holds more than
        two distinct values.
    """
    factor_values = table[factor_col]
    # Positional access: the table's index need not start at 0.
    sample = factor_values.iloc[0]
    # Coerce user-supplied levels to the kind of value stored in the column.
    if type(sample) is not str:
        if type(sample) is bool:
            # NOTE(review): bool() of any non-empty string is True; confirm
            # how callers pass boolean levels.
            if first is not None:
                first = bool(first)
            if second is not None:
                second = bool(second)
        else:
            if first is not None:
                first = float(first)
            if second is not None:
                second = float(second)

    if first is None or second is None:
        # Collect the distinct levels, starting with the ones given.
        tmp_factors = []
        if first is not None:
            tmp_factors += [first]
        if second is not None:
            tmp_factors += [second]
        for position in range(len(factor_values)):
            value = factor_values.iloc[position]
            if value is not None and value not in tmp_factors:
                if len(tmp_factors) == 2:
                    raise Exception("There are more that 2 factors.")
                tmp_factors += [value]
    if first is None:
        first = tmp_factors[0] if tmp_factors[0] != second else tmp_factors[1]
    if second is None:
        second = tmp_factors[0] if tmp_factors[0] != first else tmp_factors[1]

    table_first = table[table[factor_col] == first]
    table_second = table[table[factor_col] == second]
    tmp_table = []
    number1 = len(table_first[factor_col])
    number2 = len(table_second[factor_col])
    d_num = number1 - 1
    d_denum = number2 - 1
    rb = BrtcReprBuilder()
    # Title line carries the same '|' margin as the lines below it.
    rb.addMD(
        strip_margin("""
    | ## F Test for Stacked Data Result
    | - Confidence level = {confi_level}
    | - Statistics = F statistic, F distribution with {d_num} numerator degrees of freedom and {d_denum} degrees of freedom under the null hypothesis
    """.format(confi_level=confi_level, d_num=d_num, d_denum=d_denum)))

    f_dist = scipy.stats.f
    statistics_label = (
        'F statistic, F distribution with %d numerator degrees of freedom and %d degrees of freedom under the null hypothesis.'
        % (d_num, d_denum))

    for response_col in response_cols:
        tmp_model = []
        std1 = (table_first[response_col]).std()
        std2 = (table_second[response_col]).std()
        f_value = (std1**2) / (std2**2)
        data_label = '%s by %s(%s,%s)' % (response_col, factor_col, first,
                                          second)

        # Tail probabilities of the observed statistic under H0. The upper
        # tail P(F >= f) is sf(f, d_num, d_denum); writing it as
        # cdf(1 / f, ...) would need the degrees of freedom swapped.
        lower_tail = f_dist.cdf(f_value, d_num, d_denum)
        upper_tail = f_dist.sf(f_value, d_num, d_denum)

        if 'larger' in alternatives:
            p_value = upper_tail
            lower = f_value / f_dist.ppf(confi_level, d_num, d_denum)
            upper = math.inf
            tmp_model.append(['true ratio > 1', p_value, (lower, upper)])
            tmp_table.append([
                data_label, 'true ratio of variances > 1', statistics_label,
                f_value, p_value, confi_level, lower, upper
            ])

        if 'smaller' in alternatives:
            p_value = lower_tail
            lower = 0.0
            upper = f_value * f_dist.ppf(confi_level, d_denum, d_num)
            tmp_model.append(['true ratio < 1', p_value, (lower, upper)])
            tmp_table.append([
                data_label, 'true ratio of variances < 1', statistics_label,
                f_value, p_value, confi_level, lower, upper
            ])

        if 'two-sided' in alternatives:
            # Twice the smaller tail.
            p_value = 2 * min(lower_tail, upper_tail)
            lower = f_value / f_dist.ppf(
                (1 + confi_level) / 2, d_num, d_denum)
            upper = f_value * f_dist.ppf(
                (1 + confi_level) / 2, d_denum, d_num)
            tmp_model.append(['true ratio != 1', p_value, (lower, upper)])
            tmp_table.append([
                data_label, 'true ratio of variances != 1', statistics_label,
                f_value, p_value, confi_level, lower, upper
            ])

        result_model = pd.DataFrame.from_records(tmp_model)
        result_model.columns = [
            'alternative_hypothesis', 'p-value',
            '%g%% confidence interval' % (confi_level * 100)
        ]
        rb.addMD(
            strip_margin("""
        | #### Data = {response_col} by {factor_col}({first},{second})
        | - F-value = {f_value}
        |
        | {result_model}
        |
        """.format(response_col=response_col,
                   factor_col=factor_col,
                   first=first,
                   second=second,
                   f_value=f_value,
                   result_model=pandasDF2MD(result_model))))

    result = pd.DataFrame.from_records(tmp_table)
    result.columns = [
        'data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value',
        'confidence_level', 'lower_confidence_interval',
        'upper_confidence_interval'
    ]

    model = dict()
    model['_repr_brtc_'] = rb.get()
    return {'out_table': result, 'model': model}
Beispiel #27
0
def _hierarchical_clustering(table,
                             input_cols,
                             input_mode='original',
                             key_col=None,
                             link='complete',
                             met='euclidean',
                             num_rows=20,
                             figure_height=6.4,
                             orient='right'):
    """Run agglomerative clustering and build a dendrogram report.

    Args:
        table: Input DataFrame. The function works on a copy of it.
        input_cols: Columns passed to ``check_col_type`` to obtain the
            feature data.
        input_mode: ``'original'`` clusters the rows of the feature data
            using pairwise distances computed with ``met``; ``'matrix'``
            treats the selected data as a square distance matrix. Any other
            value is reported through ``raise_runtime_error``.
        key_col: Optional column that supplies the leaf labels. When it is
            ``None`` the labels are ``pt_<i>`` in ``'original'`` mode and the
            selected column names in ``'matrix'`` mode.
        link: Linkage method passed to ``linkage``.
        met: Metric passed to ``pdist`` and ``linkage``.
        num_rows: Number of linkage-matrix rows shown in the report.
        figure_height: Height of the dendrogram figure (the width is 8.4).
        orient: Orientation passed to ``dendrogram``.

    Returns:
        ``{'model': model}`` where ``model`` holds the linkage array ``Z``,
        the input mode, the working table, the parameters, the readable
        linkage matrix (last merge first), the rendered report and, in
        ``'matrix'`` mode, the distance matrix.
    """
    out_table = table.copy()
    feature_names, features = check_col_type(out_table, input_cols)

    if input_mode == 'original':
        len_features = len(features)
        if key_col is not None:
            data_names = list(out_table[key_col])
        else:
            data_names = ['pt_' + str(i) for i in range(len_features)]
        out_table['name'] = data_names
        Z = linkage(ssd.pdist(features, metric=met), method=link, metric=met)
    elif input_mode == 'matrix':
        len_features = len(input_cols)
        col_index = [
            out_table.columns.get_loc(column) for column in input_cols
        ]
        if key_col is not None:
            data_names = [out_table[key_col][loc] for loc in col_index]
        else:
            data_names = [out_table.columns[loc] for loc in col_index]
        dist_matrix = features.iloc[col_index]

        Z = linkage(ssd.squareform(dist_matrix), method=link, metric=met)
        dist_matrix['name'] = data_names
    else:
        raise_runtime_error("Please check 'input_mode'.")

    # Row i of Z is named CL_(len(Z) - i), so the final merge is CL_1.
    step_order = list(reversed(range(len(Z))))
    cluster_names = ['CL_%g' % (step + 1) for step in step_order]

    def _member_name(node_id):
        # Z stores node ids as floats: ids below len_features are original
        # observations, the others refer to the cluster created by row
        # (id - len_features) of Z.
        node_id = int(node_id)
        if node_id < len_features:
            return data_names[node_id]
        return cluster_names[node_id - len_features]

    linkage_matrix = pd.DataFrame([])
    linkage_matrix['linkage step'] = ['%g' % (step + 1) for step in step_order]
    linkage_matrix['name of clusters'] = cluster_names
    linkage_matrix['joined column1'] = [_member_name(n) for n in Z[:, 0]]
    linkage_matrix['joined column2'] = [_member_name(n) for n in Z[:, 1]]
    linkage_matrix['distance'] = list(Z[:, 2])
    linkage_matrix['number of original'] = [
        int(entities) for entities in Z[:, 3]
    ]
    # Present the last merge first.
    linkage_matrix = linkage_matrix.reindex(index=linkage_matrix.index[::-1])

    # calculate full dendrogram

    plt.figure(figsize=(8.4, figure_height))
    dendrogram(Z,
               truncate_mode='none',
               get_leaves=True,
               orientation=orient,
               labels=data_names,
               leaf_rotation=45,
               leaf_font_size=10.,
               show_contracted=False)
    plt.title('Hierarchical Clustering Dendrogram')
    # 'top'/'bottom' put the leaves on the x axis, 'left'/'right' on the y
    # axis.
    if orient in ('top', 'bottom'):
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient in ('left', 'right'):
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    plt.tight_layout()
    plt2 = plt2MD(plt)
    plt.clf()

    params = {
        'Input Columns': feature_names,
        'Input Mode': input_mode,
        'Linkage Method': link,
        'Metric': met,
        'Number of Rows in Linkage Matrix': num_rows
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Hierarchical Clustering Result"""))
    rb.addMD(
        strip_margin("""
    |### Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    |{display_params}
    |
    |### Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=plt2,
               display_params=dict2MD(params),
               out_table1=pandasDF2MD(linkage_matrix.head(num_rows),
                                      num_rows=num_rows + 1))))

    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_mode'] = input_mode
    model['table'] = out_table
    if input_mode == 'matrix':
        model['dist_matrix'] = dist_matrix
    model['parameters'] = params
    model['linkage_matrix'] = linkage_matrix
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _naive_bayes_train(table,
                       feature_cols,
                       label_col,
                       alpha=1.0,
                       fit_prior=True,
                       class_prior=None):
    """Fit a multinomial naive Bayes classifier and build its report.

    Args:
        table: Input DataFrame.
        feature_cols: Columns passed to ``check_col_type`` to obtain the
            feature data.
        label_col: Name of the label column; its values are encoded with a
            ``LabelEncoder`` before fitting.
        alpha: Smoothing parameter passed to ``MultinomialNB``.
        fit_prior: Passed to ``MultinomialNB`` as ``fit_prior``.
        class_prior: Optional list of ``"<label>:<prior>"`` strings. Each
            prior is stored at the encoded position of its label before
            being passed to ``MultinomialNB``.

    Returns:
        ``{'model': model}`` where ``model`` holds the feature columns, the
        label column, the fitted label encoder, the fitted classifier and
        the rendered report.
    """
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        encoded_prior = [0] * len(class_prior)
        for entry in class_prior:
            parts = entry.split(":")
            position = label_encoder.transform([parts[0]])[0]
            encoded_prior[position] = float(parts[1])
        class_prior = encoded_prior

    # Hyper-parameters are passed by keyword: they are keyword-only in
    # current scikit-learn releases, so the positional form raises TypeError.
    nb_model = MultinomialNB(alpha=alpha,
                             fit_prior=fit_prior,
                             class_prior=class_prior)
    nb_model.fit(features, label_correspond)
    class_log_prior = nb_model.class_log_prior_
    feature_log_prob_ = nb_model.feature_log_prob_

    # One row per class: label, log prior, then one log-probability per
    # feature.
    label_and_prior = [
        list(pair) for pair in zip(label_encoder.classes_, class_log_prior)
    ]
    tmp_result = np.hstack((label_and_prior, feature_log_prob_))
    column_names = ['labels', 'pi'] + [
        'theta_' + feature_col for feature_col in feature_names
    ]
    result_table = pd.DataFrame.from_records(tmp_result, columns=column_names)
    prediction_correspond = nb_model.predict(features)

    get_param = {
        'Lambda': alpha,
        'Fit Class Prior Probability': fit_prior,
        'Feature Columns': feature_names,
        'Label Column': label_col,
    }

    # The confusion matrix and accuracy are computed on the training data.
    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    _plot_confusion_matrix(cnf_matrix,
                           classes=label_encoder.classes_,
                           title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Naive Bayes Classification Result
    |
    | ### Model:Multinomial
    | {result_table}
    | ### Parameters
    | {table_parameter} 
    | ### Predicted vs Actual
    | {image1}
    | #### Accuacy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix,
               accuracy=accuracy,
               result_table=pandasDF2MD(result_table),
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Beispiel #29
0
def _tfidf(table, input_col, max_df=None, min_df=1, num_voca=1000,
           idf_weighting_scheme='inverseDocumentFrequency', norm='l2',
           smooth_idf=True, sublinear_tf=False, output_type=False):
    """Compute term counts and TF-IDF scores for a text column.

    Args:
        table: Input DataFrame.
        input_col: Name of the column holding the documents.
        max_df: Passed to ``CountVectorizer``; ``None`` is replaced by the
            number of documents.
        min_df: Passed to ``CountVectorizer``.
        num_voca: Passed to ``CountVectorizer`` as ``max_features``.
        idf_weighting_scheme: ``'inverseDocumentFrequency'`` reports the
            transformer's scores and IDF weights; ``'unary'`` reports the
            raw frequency as the score and 1.0 as every IDF weight.
        norm: Passed to ``TfidfTransformer``.
        smooth_idf: Passed to ``TfidfTransformer``.
        sublinear_tf: Passed to ``TfidfTransformer``.
        output_type: ``False`` lists every (document, vocabulary) pair;
            ``True`` lists only the pairs stored in the sparse count
            matrix. Any other value is reported through
            ``raise_runtime_error``.

    Returns:
        ``{'model': model}`` where ``model`` holds both sparse matrices, the
        parameters, the IDF table, the TF-IDF table and the rendered report.
    """
    corpus = np.array(table[input_col])
    if max_df is None:
        max_df = len(corpus)
    tf_vectorizer = CountVectorizer(stop_words='english', max_df=max_df,
                                    min_df=min_df, max_features=num_voca)
    tf_vectorizer.fit(corpus)
    csr_matrix_tf = tf_vectorizer.transform(corpus)
    tfidf_vectorizer = TfidfTransformer(norm=norm, use_idf=True,
                                        smooth_idf=smooth_idf,
                                        sublinear_tf=sublinear_tf)
    csr_matrix_tfidf = tfidf_vectorizer.fit_transform(csr_matrix_tf)

    # Sorting by column index makes vocabulary[i] the term of column i.
    voca_dict = sorted(tf_vectorizer.vocabulary_.items(), key=itemgetter(1))
    vocabulary = [term for term, _ in voca_dict]
    len_voca = len(vocabulary)

    n_docs = len(corpus)
    doc_ids = ['doc_{}'.format(doc) for doc in range(n_docs)]
    doc_texts = [str(text) for text in corpus]

    # tf-idf table

    tfidf_table = pd.DataFrame()
    # The equality checks are kept on purpose so that 0 and 1 behave like
    # False and True, as they always have.
    if output_type == False:
        # One row per (document, vocabulary) pair, zero counts included.
        tfidf_table['document_id'] = [
            doc_id for doc_id in doc_ids for _ in range(len_voca)
        ]
        tfidf_table[input_col] = [
            text for text in doc_texts for _ in range(len_voca)
        ]
        tfidf_table['vocabulary'] = vocabulary * n_docs
        tfidf_table['frequency'] = np.ravel(csr_matrix_tf.todense())
        if idf_weighting_scheme == 'inverseDocumentFrequency':
            tfidf_table['tfidf score'] = np.ravel(csr_matrix_tfidf.todense())
        elif idf_weighting_scheme == 'unary':
            tfidf_table['tfidf score'] = list(
                map(float, np.array(tfidf_table['frequency'])))

    elif output_type == True:
        # One row per entry stored in the count matrix, in its CSR order.
        nnz_per_doc = np.diff(csr_matrix_tf.indptr)
        tfidf_table['document_id'] = [
            doc_id for doc_id, nnz in zip(doc_ids, nnz_per_doc)
            for _ in range(nnz)
        ]
        tfidf_table[input_col] = [
            text for text, nnz in zip(doc_texts, nnz_per_doc)
            for _ in range(nnz)
        ]
        tfidf_table['vocabulary'] = [
            vocabulary[i] for i in csr_matrix_tf.indices
        ]
        tfidf_table['frequency'] = csr_matrix_tf.data
        if idf_weighting_scheme == 'inverseDocumentFrequency':
            # Read each score by (row, column) so it lines up with the
            # vocabulary/frequency rows above. The earlier code read
            # data[0:nnz] for every document (missing the indptr offset) and
            # reversed it, so later documents got the first one's scores.
            doc_rows = np.repeat(np.arange(n_docs), nnz_per_doc)
            tfidf_table['tfidf score'] = np.asarray(
                csr_matrix_tfidf[doc_rows, csr_matrix_tf.indices]).ravel()
        elif idf_weighting_scheme == 'unary':
            tfidf_table['tfidf score'] = list(
                map(float, np.array(tfidf_table['frequency'])))

    else:
        raise_runtime_error("Please check 'output_type'.")

    # idf table

    idf_table = pd.DataFrame()
    idf_table['vocabulary'] = vocabulary
    if idf_weighting_scheme == 'inverseDocumentFrequency':
        idf_table['idf weight'] = tfidf_vectorizer.idf_.tolist()
    elif idf_weighting_scheme == 'unary':
        idf_table['idf weight'] = float(1)

    params = {
        'Input Column': input_col,
        'Max DF': max_df,
        'Min DF': min_df,
        'Number of Vocabularies': num_voca,
        'IDF Weighting Scheme': idf_weighting_scheme,
        'Norm': norm,
        'Smooth IDF': smooth_idf,
        'Sublinear TF': sublinear_tf,
        'Remove Zero Counts': output_type
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# TF-IDF Result"""))
    rb.addMD(strip_margin("""
    |
    |### Parameters
    |
    |{display_params}
    |
    |### IDF Table
    |
    |{idf_table}
    |
    |### TFIDF Table
    |
    |{tfidf_table}
    |
    """.format(display_params=dict2MD(params), idf_table=pandasDF2MD(idf_table, num_rows=200), tfidf_table=pandasDF2MD(tfidf_table, num_rows=200))))

    model = _model_dict('tfidf')
    model['csr_matrix_tf'] = csr_matrix_tf
    model['csr_matrix_tfidf'] = csr_matrix_tfidf
    model['parameter'] = params
    model['idf_table'] = idf_table
    model['tfidf_table'] = tfidf_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}