Example 1
def classification_predict(table, model, prediction_col='prediction', prob_prefix='probability',
                                 output_log_prob=False, log_prob_prefix='log_probability', thresholds=None,
                                 suffix='index'):
    if '_grouped_data' in model:
        tmp_model = model['_grouped_data']['data']
        tmp_model = list(tmp_model.values())[0]
    else:
        tmp_model = model
    if 'logistic_regression_model' in tmp_model['_type'] or 'one_vs' in tmp_model['_type']:
        return logistic_regression_predict(table=table, model=model, prediction_col=prediction_col, prob_prefix=prob_prefix,
                                 output_log_prob=output_log_prob, log_prob_prefix=log_prob_prefix, thresholds=thresholds,
                                 suffix=suffix)
    if tmp_model['_type'] == 'svm_model':
        return svm_classification_predict(table=table, model=model, prediction_col=prediction_col, prob_prefix=prob_prefix,
                                 display_log_prob=output_log_prob, log_prob_prefix=log_prob_prefix, thresholds=thresholds,
                                 suffix=suffix)
    if tmp_model['_type'] == 'decision_tree_model':
        if 'method' in tmp_model and tmp_model['method'] == 'classification':
            return decision_tree_classification_predict(table=table, model=model, prediction_col=prediction_col)
    if 'tree_classification' in tmp_model['_type']:
        return decision_tree_classification_predict(table=table, model=model, prediction_col=prediction_col)
    if tmp_model['_type'] == 'random_forest_model':
        if 'method' in tmp_model and tmp_model['method'] == 'classification':
            return random_forest_classification_predict(table=table, model=model, pred_col_name=prediction_col)
    if 'forest_classification' in tmp_model['_type'] or 'gbt_classification' in tmp_model['_type']:
        return random_forest_classification_predict(table=table, model=model, pred_col_name=prediction_col)
    if tmp_model['_type'] == 'naive_bayes_model':
        return naive_bayes_predict(table=table, model=model, prediction_col=prediction_col, prob_prefix=prob_prefix,
                                 display_log_prob=output_log_prob, log_prob_prefix=log_prob_prefix, suffix=suffix)
    raise_runtime_error('''It is not supported yet.''')
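A hedged sketch of the grouped-model unwrapping this dispatcher (and the ones below) shares: a grouped model is assumed to store one sub-model per group key under '_grouped_data' -> 'data', and dispatch inspects the first sub-model's '_type'. The dict below is hypothetical, not output of a real train function.

# Hypothetical grouped-model layout, for illustration only.
grouped_model = {'_grouped_data': {'data': {('group_a',): {'_type': 'svm_model'}}}}

if '_grouped_data' in grouped_model:
    tmp_model = list(grouped_model['_grouped_data']['data'].values())[0]
else:
    tmp_model = grouped_model

print(tmp_model['_type'])  # 'svm_model' -> would route to svm_classification_predict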
Example 2
def _svm_classification_train(table, feature_cols, label_col, c=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True,
              probability=True, tol=1e-3, max_iter=-1, random_state=None):
    validate(greater_than(c, 0.0, 'c'))
    
    _table = table.copy()
    
    _feature_cols = _table[feature_cols]
    _label_col = _table[label_col]
    
    if sklearn_utils.multiclass.type_of_target(_label_col) == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')
    
    _svc = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking,
              probability=probability, tol=tol, max_iter=max_iter, random_state=random_state)
    _svc_model = _svc.fit(_feature_cols, _label_col)
    
    get_param = _svc.get_params()
    get_param['feature_cols'] = feature_cols
    get_param['label_col'] = label_col
    
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter} 
    """.format(table_parameter=dict2MD(get_param))))
    
    _model = _model_dict('svc_model')
    _model['svc_model'] = _svc_model
    _model['features'] = feature_cols
    _model['_repr_brtc_'] = rb.get()
    
    return {'model':_model}
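A minimal, self-contained sketch of the fitting core above, using scikit-learn directly on synthetic data; the Brightics helpers (validate, BrtcReprBuilder, _model_dict) are omitted and the toy data is an assumption.

import pandas as pd
from sklearn import svm
from sklearn.datasets import make_classification

# Synthetic two-class data standing in for table[feature_cols] / table[label_col].
X, y = make_classification(n_samples=40, n_features=2, n_informative=2,
                           n_redundant=0, random_state=0)
features = pd.DataFrame(X, columns=['f0', 'f1'])

clf = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
              probability=True, tol=1e-3, max_iter=-1, random_state=0)
clf.fit(features, y)
print(clf.predict(features[:3]), clf.predict_proba(features[:3]).round(2))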
Example 3
def _outlier_detection_tukey_carling_model(table, model, new_column_prefix='is_outlier_'):
    out_table = table.copy()
    input_cols = model['input_cols']
    outlier_method = model['outlier_method']
    result_type = model['result_type']
    output_col_names = []
    
    if outlier_method == 'tukey':
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix, col=col)
            output_col_names.append(output_col_name)
            out_table[output_col_name] = out_table[col].apply(
                lambda _: _tukey(_, model['q1'][col], model['q3'][col],
                                 model['iqr'][col], model['multiplier']))
    elif outlier_method == 'carling':
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix, col=col)
            output_col_names.append(output_col_name)
            out_table[output_col_name] = out_table[col].apply(
                lambda _: _carling(_, model['median'][col], model['iqr'][col],
                                   model['multiplier']))
    else:
        raise_runtime_error("Please check 'outlier_method'.")
        
    # result_type is one of 'add_prediction', 'remove_outliers', 'both'
    if result_type == 'add_prediction':
        pass
    elif result_type == 'remove_outliers':
        prediction = out_table[output_col_names].apply(lambda row: np.sum(row == 'out') < model['number_of_removal'], axis=1)
        out_table = out_table[prediction.values]
        out_table = out_table.drop(output_col_names, axis=1)
    elif result_type == 'both':
        prediction = out_table[output_col_names].apply(lambda row: np.sum(row == 'out') < model['number_of_removal'], axis=1)
        out_table = out_table[prediction.values]
    else:
        raise_runtime_error("Please check 'result_type'.")
    
    return {'out_table':out_table}
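The _tukey and _carling helpers are not shown in this corpus; the sketches below are plausible reconstructions consistent with how they are called (a value plus the stored quartiles/median, IQR, and multiplier, returning 'in' or 'out'). The real Brightics implementations may differ in detail.

def _tukey(value, q1, q3, iqr, multiplier):
    # Tukey's fences: values outside [q1 - k*IQR, q3 + k*IQR] are outliers.
    if q1 - multiplier * iqr <= value <= q3 + multiplier * iqr:
        return 'in'
    return 'out'


def _carling(value, median, iqr, multiplier):
    # Carling's modification centers the fences on the median.
    if median - multiplier * iqr <= value <= median + multiplier * iqr:
        return 'in'
    return 'out'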
Example 4
def _doc_doc_mtx(table, model, input_col, result_type='sparse'):
    corpus = table[input_col].tolist()

    dictionary = model['dictionary']

    bow_corpus = []
    for doc in corpus:
        bow_corpus.append(dictionary.doc2bow(doc))

    csr_matrix = matutils.corpus2csc(bow_corpus).T
    csr_matrix.data = np.ones_like(csr_matrix.data)  # binarize term counts
    doc_doc = (csr_matrix @ (csr_matrix.T)).tocoo()

    if result_type == 'sparse':
        doc_doc = sparse.triu(doc_doc, k=1)
        out_table = pd.DataFrame(doc_doc.row, columns=['1st_document_idx'])
        out_table['2nd_document_idx'] = doc_doc.col
        out_table['number_of_common_terms'] = doc_doc.data
    elif result_type == 'dense':
        doc_idx = ['doc_{}'.format(i) for i in range(len(corpus))]
        out_table = pd.DataFrame(doc_doc.todense())
        out_table.insert(loc=0, column=' ', value=doc_idx)
        out_table.columns = np.append("", doc_idx)
    else:
        raise_runtime_error("Please check 'result_type'.")

    rb = BrtcReprBuilder()
    model = _model_dict('doc_doc_mtx')
    model['input_col'] = input_col
    model['doc_doc_mtx'] = doc_doc
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table}
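A minimal sketch of the co-occurrence core above on a toy corpus, using gensim and scipy directly; the report-builder block (whose model dict is not returned) is omitted, and the corpus is hypothetical.

import numpy as np
from gensim import matutils
from gensim.corpora import Dictionary
from scipy import sparse

corpus = [['apple', 'banana'], ['banana', 'cat'], ['cat', 'apple']]
dictionary = Dictionary(corpus)
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

mtx = matutils.corpus2csc(bow_corpus).T  # docs x terms
mtx.data = np.ones_like(mtx.data)        # binarize counts
doc_doc = sparse.triu((mtx @ mtx.T).tocoo(), k=1)
print(list(zip(doc_doc.row, doc_doc.col, doc_doc.data)))  # shared-term counts per doc pair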
Example 5
def join(left_table, right_table, left_on, right_on, how='inner', lsuffix='_left', rsuffix='_right', sort=False):
    sort = sort in (True, 'True', 'true')
    both_on = list(set(left_on) & set(right_on))
    if len(both_on) > 0 and how in ['outer', 'left', 'right']:
        left_table = left_table.rename(columns={key:key + lsuffix for key in both_on})
        right_table = right_table.rename(columns={key:key + rsuffix for key in both_on})
        left_on = [col_name + lsuffix if col_name in both_on else col_name for col_name in left_on]
        right_on = [col_name + rsuffix if col_name in both_on else col_name for col_name in right_on]
        
    if how == 'left_exclude':
        table = pd.merge(left_table, right_table, how='outer', left_on=left_on, right_on=right_on, suffixes=(lsuffix, rsuffix), sort=sort, indicator=True)
        left_columns = left_table.columns
        col_names_left = list(set(left_columns) & set(right_table.columns) - set(right_on))
        col_names = [col_name + lsuffix if col_name in col_names_left else col_name for col_name in left_columns]
        table = table[table['_merge'] == 'left_only'][col_names]
        table = table.rename(columns={col_name + lsuffix:col_name for col_name in col_names_left})
        if table.empty:
            raise_runtime_error("The result is empty.")
    elif how == 'right_exclude':
        table = pd.merge(left_table, right_table, how='outer', left_on=left_on, right_on=right_on, suffixes=(lsuffix, rsuffix), sort=sort, indicator=True)
        right_columns = right_table.columns
        col_names_right = list(set(left_table.columns) & set(right_columns) - set(right_on))
        col_names = [col_name + rsuffix if col_name in col_names_right else col_name for col_name in right_columns]
        table = table[table['_merge'] == 'right_only'][col_names]
        table = table.rename(columns={col_name + rsuffix:col_name for col_name in col_names_right})
        if table.empty:
            raise_runtime_error("The result is empty.")
    else:
        table = pd.merge(left_table, right_table, how=how, left_on=left_on, right_on=right_on, suffixes=(lsuffix, rsuffix), sort=sort)
    
    return {'table' : table}
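A hypothetical usage of join() above, assuming pandas is imported as pd in the module; 'inner' keeps the intersection of keys, while 'left_exclude' would keep only left-table rows with no match.

import pandas as pd

left = pd.DataFrame({'id': [1, 2, 3], 'x': ['a', 'b', 'c']})
right = pd.DataFrame({'id': [2, 3, 4], 'y': [10, 20, 30]})

result = join(left, right, left_on=['id'], right_on=['id'], how='inner')
print(result['table'])  # rows for id 2 and 3, with columns id, x, y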
Example 6
def read_from_db(datasource, sql):
    if sql is None:
        raise_runtime_error("'sql' is a required parameter")

    with DbEngine(**datasource) as engine:
        df = pd.read_sql_query(sql, engine)
        util.validate_column_name(df)
        return {'table': df}
Example 7
def process_text(text):
    # `method` and the METHOD_* constants are closed over from the enclosing scope.
    if method == METHOD_NLTK:
        return process_text_nltk(text, ne_extraction_nltk)
    elif method == METHOD_SPACY:
        return process_text_spacy(text, ne_extraction_spacy)
    elif method == METHOD_STANFORD:
        return process_text_stanford(text)
    else:
        raise_runtime_error("Invalid method name.")
Example 8
def _representative_evaluation_value(table, input_col):
    if table.shape[0] > 1:
        raise_runtime_error("Only one column with one row is allowed")

    col_name = input_col
    col_value = table.iloc[0][col_name]

    raw_data = "{'accuracy_index': " + col_name + ", 'accuracy_value': " + str(col_value) + " }"
    print(raw_data)
Example 9
def _outlier_detection_lof(table,
                           input_cols,
                           n_neighbors=20,
                           result_type='add_prediction',
                           new_column_name='is_outlier'):
    out_table = table.copy()
    features = out_table[input_cols]
    lof_model = LocalOutlierFactor(n_neighbors=n_neighbors,
                                   algorithm='auto',
                                   leaf_size=30,
                                   metric='minkowski',
                                   p=2,
                                   novelty=True,
                                   contamination=0.1)
    lof_model.fit(features)

    isinlier = lambda _: 'in' if _ == 1 else 'out'
    out_table[new_column_name] = [
        isinlier(lof_predict) for lof_predict in lof_model.predict(features)
    ]

    if result_type == 'add_prediction':
        pass
    elif result_type == 'remove_outliers':
        out_table = out_table[out_table[new_column_name] == 'in']
        out_table = out_table.drop(new_column_name, axis=1)
    elif result_type == 'both':
        out_table = out_table[out_table[new_column_name] == 'in']
    else:
        raise_runtime_error("Please check 'result_type'.")

    params = {
        'Input Columns': input_cols,
        'Result Type': result_type,
        'Number of Neighbors': n_neighbors,
    }

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Outlier Detection (Local Outlier Factor) Result
    | ### Parameters
    |
    | {display_params}
    |
    """.format(display_params=dict2MD(params))))

    model = _model_dict('outlier_detection_lof')
    model['params'] = params
    model['lof_model'] = lof_model
    model['input_cols'] = input_cols
    model['result_type'] = result_type
    model['num_neighbors'] = n_neighbors
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
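A minimal sketch of the LOF core above with synthetic data; the Brightics report/model plumbing is omitted. With novelty=True the model is fit first and then scored with predict(), as the function does.

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.RandomState(0)
X = rng.normal(size=(50, 2))
X[0] = [8.0, 8.0]  # plant an obvious outlier

lof = LocalOutlierFactor(n_neighbors=10, novelty=True, contamination=0.1)
lof.fit(X)
labels = ['in' if p == 1 else 'out' for p in lof.predict(X)]
print(labels[0], labels.count('out'))  # 'out', roughly 10% of rows flagged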
Example 10
def clustering_predict(model, num_clusters, cluster_col='cluster'):
    if '_grouped_data' in model:
        tmp_model = model['_grouped_data']['data']
        tmp_model = list(tmp_model.values())[0]
    else:
        tmp_model = model
    if tmp_model['_type'] == 'hierarchical_clustering':
        return hierarchical_clustering_post(model=model,
                                            num_clusters=num_clusters,
                                            cluster_col=cluster_col)
    raise_runtime_error('''It is not supported yet.''')
Example 11
def regression_predict(table, model, prediction_col='prediction'):
    if '_grouped_data' in model:
        tmp_model = model['_grouped_data']['data']
        tmp_model = list(tmp_model.values())[0]
    else:
        tmp_model = model
    if 'linear_regression_model' in tmp_model['_type']:
        return linear_regression_predict(table=table,
                                         model=model,
                                         prediction_col=prediction_col)
    if tmp_model['_type'] == 'decision_tree_model':
        if 'method' in tmp_model and tmp_model['method'] == 'regression':
            return decision_tree_regression_predict(
                table=table, model=model, prediction_col=prediction_col)
    if 'tree_regression' in tmp_model['_type']:
        return decision_tree_regression_predict(table=table,
                                                model=model,
                                                prediction_col=prediction_col)
    if tmp_model['_type'] == 'random_forest_model':
        if 'method' in tmp_model and tmp_model['method'] == 'regression':
            return random_forest_regression_predict(
                table=table, model=model, prediction_col=prediction_col)
    if ('forest_regression' in tmp_model['_type']
            or 'gbt_regression' in tmp_model['_type']):
        return random_forest_regression_predict(table=table,
                                                model=model,
                                                prediction_col=prediction_col)
    if tmp_model['_type'] == 'ada_boost_regression_model':
        return ada_boost_regression_predict(table=table,
                                            model=model,
                                            pred_col_name=prediction_col)
    if tmp_model['_type'] == 'glm_model':
        return glm_predict(table=table,
                           model=model,
                           prediction_col=prediction_col)
    if tmp_model['_type'] == 'mlp_regression_model':
        return mlp_regression_predict(table=table,
                                      model=model,
                                      prediction_col=prediction_col)
    if tmp_model['_type'] == 'xgb_regression_model':
        return xgb_regression_predict(table=table,
                                      model=model,
                                      prediction_col=prediction_col)
    if tmp_model['_type'] == 'isotonic_regression_model':
        return isotonic_regression_predict(table=table,
                                           model=model,
                                           prediction_col=prediction_col)
    if tmp_model['_type'] == 'pls_regression_model':
        return pls_regression_predict(table=table,
                                      model=model,
                                      prediction_col=prediction_col)
    raise_runtime_error('''It is not supported yet.''')
Example 12
def _doc_term_mtx(table, model, input_col, result_type='doc_to_bow_token'):
    corpus = table[input_col].tolist()

    dictionary = model['dictionary']

    bow_corpus = []
    for doc in corpus:
        bow_corpus.append(dictionary.doc2bow(doc))

    doc_to_bow = []
    for i in range(len(corpus)):
        token_cnt = []
        for j in range(len(bow_corpus[i])):
            token_cnt.append('({token}, {cnt})'.format(
                token=dictionary[bow_corpus[i][j][0]],
                cnt=bow_corpus[i][j][1]))
        doc_to_bow.append(token_cnt)
    doc_to_bow_list = []
    for doc in doc_to_bow:
        doc_to_bow_list.append('{}'.format(list(doc)))

    doc_idx = ['doc_{}'.format(i) for i in range(len(corpus))]
    terms = [term for term in dictionary.token2id.keys()]

    if result_type == 'doc_to_bow_token':
        out_table = pd.DataFrame(data=doc_to_bow_list, columns=['doc_to_bow'])
        out_table.insert(loc=0, column='doc_idx', value=doc_idx)
    elif result_type == 'doc_term_mtx':
        out_table = pd.DataFrame(
            matutils.corpus2dense(bow_corpus,
                                  num_terms=len(dictionary.token2id)).T)
        out_table.insert(loc=0, column=' ', value=doc_idx)
        out_table.columns = np.append('', terms)
    elif result_type == 'term_doc_mtx':
        out_table = pd.DataFrame(
            matutils.corpus2dense(bow_corpus,
                                  num_terms=len(dictionary.token2id)))
        out_table.insert(loc=0, column=' ', value=terms)
        out_table.columns = np.append('', doc_idx)
    else:
        raise_runtime_error("Please check 'result_type'.")

    rb = BrtcReprBuilder()
    model = _model_dict('doc_term_mtx')
    model['bow_corpus'] = bow_corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table}
Example 13
def image_load(path,
               labeling='dir',
               image_col='image',
               n_sample=None,
               size_limit=640,
               auto_resize_limit=False):
    def _is_big_image(img, limit):
        return max(img.height, img.width) > limit

    if labeling == 'dir':
        images_file_list = glob.glob('''{}/*/*'''.format(path))
        if n_sample is not None:
            images_file_list = random.sample(images_file_list, n_sample)
        npy_images = [(cv2.imread(x), x) for x in images_file_list]
        label = [
            os.path.split(os.path.dirname(os.path.abspath(x[1])))[1]
            for x in npy_images if x[0] is not None
        ]
    else:
        images_file_list = glob.glob('''{}/*'''.format(path))
        if n_sample is not None:
            images_file_list = random.sample(images_file_list, n_sample)
        npy_images = [(cv2.imread(x), x) for x in images_file_list]
        label = None

    loaded_images = [
        Image(x[0], origin=x[1]) for x in npy_images if x[0] is not None
    ]

    # check the size of loaded images
    if any(_is_big_image(x, size_limit) for x in loaded_images):
        if auto_resize_limit:
            encoded_images = [
                x.resize_limit(size_limit).tobytes() for x in loaded_images
            ]
        else:
            raise_runtime_error(
                'Cannot load images with size over {}px.'.format(size_limit))
    else:
        encoded_images = [x.tobytes() for x in loaded_images]

    label_col = '{}_label'.format(image_col)
    out_df = pd.DataFrame({image_col: encoded_images})
    if label is not None:
        out_df[label_col] = label

    return {'out_table': out_df}
Example 14
def evaluate_ranking_algorithm(table1, table2, user_col, item_col, evaluation_measure, rating_col=None, rating_edge=None, k_values=None):
    none_str = 'None'
    item_encoder = preprocessing.LabelEncoder()
    tmp_table_item_col = table1[item_col].values.tolist()
    tmp_table_item_col.append(none_str)
    item_encoder.fit(tmp_table_item_col)
    if table2.columns[0] != 'user_name' and table2.columns[0] != 'user':
        raise_runtime_error("topN-list data schema should consist of [user_name, item_top1, rating_top1, .... item_topN, rating_topN]")
    user_encoder = preprocessing.LabelEncoder()
    user_encoder.fit(table2[table2.columns[0]])
    if rating_col is not None and rating_edge is not None:
        table = table1[table1[rating_col] > rating_edge]
    else:
        table = table1
    table = table[table[user_col].isin(user_encoder.classes_)]
    table_user_col = table[user_col]
    table_item_col = table[item_col]
    user_correspond = user_encoder.transform(table_user_col)
    item_correspond = item_encoder.transform(table_item_col)
    documents = dict()
    for i in range(len(user_encoder.classes_)):
        documents[i] = []
    for i in range(len(user_correspond)):
        documents[user_correspond[i]].append(item_correspond[i])
    columns = []
    for i in range(int(len(table2.columns) / 2)):
        if (table2.columns[2 * i + 1] != 'item_%d' % (i + 1)
                and table2.columns[2 * i + 2] != 'rating_%d' % (i + 1)
                and table2.columns[2 * i + 1] != 'item_top%d' % (i + 1)
                and table2.columns[2 * i + 2] != 'rating_top%d' % (i + 1)):
            raise_runtime_error("topN-list data schema should consist of [user_name, item_top1, rating_top1, .... item_topN, rating_topN]")
        columns.append(table2.columns[2 * i + 1])
    recommend_table = table2[columns].replace('', none_str).fillna(none_str).values
    for i in range(len(recommend_table)):
        recommend_table[i] = item_encoder.transform(recommend_table[i])
    result = []
    num_users = len(user_encoder.classes_)
    if k_values is not None:
        if 'prec' in evaluation_measure:
            for k_value in k_values:
                result.append(['precision_{}'.format(k_value), _precision_k(k_value, num_users, documents, recommend_table)])
        if 'ndcg' in evaluation_measure:
            for k_value in k_values:
                result.append(['ndcg_{}'.format(k_value), _ndcg_k(k_value, num_users, documents, recommend_table)])
    if 'map' in evaluation_measure:
        result.append(['meanAveragePrecision', _map(num_users, documents, recommend_table)])
    result = pd.DataFrame(result, columns=['measure', 'value'])
    return {'out_table':result}
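The _precision_k, _ndcg_k, and _map helpers are not included in this corpus; below is a plausible sketch of _precision_k alone (mean precision@k over users, where documents[u] holds a user's relevant item ids and recommend_table[u] the ranked recommendations). The real helpers may differ, e.g. in how 'None' placeholders are treated.

def _precision_k(k_value, num_users, documents, recommend_table):
    # Mean precision@k: fraction of the top-k recommendations that are relevant.
    total = 0.0
    for user in range(num_users):
        relevant = set(documents[user])
        top_k = list(recommend_table[user])[:k_value]
        hits = sum(1 for item in top_k if item in relevant)
        total += hits / k_value
    return total / num_users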
Example 15
def _kmeans_predict(table, model, prediction_col='prediction'):
    if model['_context'] == 'python' and model['_type'] == 'kmeans':
        k_means = model['model']
    elif model['_context'] == 'python' and model['_type'] == 'kmeans_silhouette':
        k_means = model['best_model']
    else:
        raise_runtime_error("Unsupported model")

    input_cols = model['input_cols']
    out_table = table.copy()
    out_table[prediction_col] = k_means.predict(table[input_cols])

    return {'out_table': out_table}
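A hypothetical usage of _kmeans_predict above: the model dict is normally produced by the corresponding Brightics k-means train function, so the hand-built dict here is an assumption about its minimal shape.

import pandas as pd
from sklearn.cluster import KMeans

df = pd.DataFrame({'x': [0.0, 0.1, 5.0, 5.1], 'y': [0.0, 0.2, 5.0, 4.9]})
k_means = KMeans(n_clusters=2, n_init=10, random_state=0).fit(df[['x', 'y']])

model = {'_context': 'python', '_type': 'kmeans',
         'model': k_means, 'input_cols': ['x', 'y']}
print(_kmeans_predict(df, model)['out_table'])  # two clusters of two rows each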
Example 16
def read_from_db(datasource, sql):
    if sql is None:
        raise_runtime_error("'sql' is a required parameter")
    import re
    sql_tokens = re.sub(' +', ' ',
                        sql.lower().replace("(", " ( ").replace(")", " ) ")
                        ).replace(". ", ".").split(" ")
    for i in range(len(sql_tokens) - 1):
        if sql_tokens[i] == 'from':
            tmp_token = sql_tokens[i + 1].split('.')
            if len(tmp_token) == 2 and tmp_token[1] in sys_table_lists:
                raise Exception(
                    'Cannot access system tables from Brightics: {}'.format(
                        sql_tokens[i + 1]))
    with DbEngine(**datasource) as engine:
        df = pd.read_sql_query(sql, engine)
        util.validate_column_name(df)
        return {'table': df}
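An illustration of the system-table guard above on a hypothetical query; sys_table_lists is assumed to contain the names of the blocked tables.

import re

sql = "SELECT * FROM information_schema.tables"
tokens = re.sub(' +', ' ',
                sql.lower().replace("(", " ( ").replace(")", " ) ")
                ).replace(". ", ".").split(" ")
i = tokens.index('from')
print(tokens[i + 1].split('.'))  # ['information_schema', 'tables'] -> blocked if listed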
Example 17
def image_unload(table,
                 input_col,
                 path,
                 type='png',
                 label_col=None,
                 labelling='dir'):
    if not _is_image_col(table, input_col):
        raise_runtime_error(
            '{} is not an image type column.'.format(input_col))

    pathlib.Path(path).mkdir(parents=True, exist_ok=True)

    if label_col is None:
        for i, x in enumerate(table[input_col]):
            img_npy = Image.from_bytes(x).data
            out_file_name = '{}/{}.{}'.format(path, i, type)
            cv2.imwrite(out_file_name, img_npy)
Example 18
def unload(table, partial_path, mode="overwrite"):
    path = data_utils.make_data_path_from_key(partial_path[0])
    if path == gateway.data_root or path == gateway.data_root + '/':
        raise_runtime_error(
            'Please check a path String and a type of path. Cannot use a root of directory for the path.'
        )
    if mode == "append":
        try:
            # Read the existing data before the old directory is removed.
            old_frame = table_reader.read_parquet(path)
            new_frame = pd.concat([old_frame, table],
                                  axis=0,
                                  ignore_index=True)
        except Exception:
            new_frame = table
        if os.path.isdir(path):
            shutil.rmtree(path)
        _write_dataframe(new_frame, path)
    else:
        if os.path.isdir(path):
            shutil.rmtree(path)
        _write_dataframe(table, path)
Example 19
def _outlier_detection_lof_model(table, model, new_column_name='is_outlier'):
    out_table = table.copy()
    result_type = model['result_type']

    isinlier = lambda _: 'in' if _ == 1 else 'out'
    out_table[new_column_name] = [isinlier(lof_predict) for lof_predict in model['lof_model'].predict(out_table[model['input_cols']])]
    
    # result_type is one of 'add_prediction', 'remove_outliers', 'both'
    
    if result_type == 'add_prediction':
        pass
    elif result_type == 'remove_outliers':
        out_table = out_table[out_table[new_column_name] == 'in']
        out_table = out_table.drop(new_column_name, axis=1)
    elif result_type == 'both':
        out_table = out_table[out_table[new_column_name] == 'in']
    else:
        raise_runtime_error("Please check 'result_type'.")      
        
    return {'out_table' : out_table}
Example 20
def _term_term_mtx(table, model, input_col, result_type='sparse'):
    corpus = table[input_col].tolist()

    dictionary = model['dictionary']

    bow_corpus = []
    for doc in corpus:
        bow_corpus.append(dictionary.doc2bow(doc))

    csr_matrix = matutils.corpus2csc(bow_corpus).T
    csr_matrix.data = np.ones_like(csr_matrix.data)  # binarize term counts
    term_term = (csr_matrix.T @ csr_matrix).tocoo()

    if result_type == 'sparse':
        term_term = sparse.triu(term_term, k=1)
        out_table = pd.DataFrame([dictionary[i] for i in term_term.row],
                                 columns=['term1'])
        out_table['term2'] = [dictionary[j] for j in term_term.col]
        out_table['number_of_documents_containing_terms'] = term_term.data

    elif result_type == 'dense':
        if model['add_words'] is None:
            model['add_words'] = []
        num_origin = len(dictionary) - len(model['add_words'])
        terms = [term for term in dictionary.token2id.keys()][:num_origin]
        out_table = pd.DataFrame(term_term.todense())
        out_table.insert(loc=0, column=' ', value=terms)
        out_table.columns = np.append(" ", terms)

    else:
        raise_runtime_error("Please check 'result_type'.")

    rb = BrtcReprBuilder()
    model = _model_dict('term_term_mtx')
    model['term_term_mtx'] = term_term
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table}
Example 21
def _explode(table, input_col):
    col = table[input_col]
    is_arr_col_list = [
        isinstance(item, np.ndarray) or isinstance(item, list) for item in col
    ]
    is_non_arr_col_list = [not item for item in is_arr_col_list]
    is_arr_col = all(is_arr_col_list)
    is_non_arr_col = all(is_non_arr_col_list)
    if not is_arr_col and not is_non_arr_col:
        raise_runtime_error(
            "{} is an invalid column to explode or un-explode.".format(
                input_col))
    elif is_arr_col:  # explode
        values = np.array(col)
        values_flattened = np.concatenate(values).ravel()
        counts = [len(item) for item in values]
        col_exploded = pd.Series(values_flattened,
                                 index=col.index.repeat(counts),
                                 name=col.name)
        out_table = table.drop([input_col], axis=1).join(col_exploded).reindex(
            columns=table.columns, copy=False)
    else:  # un-explode
        group_cols = table.columns.tolist()
        group_cols.remove(input_col)

        group_id = 'tmp_group_id'
        while group_id in table.columns:
            group_id += '_'
        group_idx = table[group_cols].drop_duplicates()
        group_idx[group_id] = np.arange(group_idx.shape[0])
        table = table.merge(group_idx, on=group_cols)

        out_table = table.groupby(group_id)[input_col].apply(list).reset_index(name=input_col) \
            .merge(group_idx, on=group_id).reindex(columns=table.columns).drop(group_id, axis=1)
        out_table[input_col] = out_table[input_col].map(
            lambda lst: [item for item in lst if not isna(item)])

    return {'out_table': out_table}
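A hypothetical round-trip through _explode above (assumes the module's numpy/pandas imports and an isna helper such as pandas.isna are in scope): list-valued columns are exploded to one row per element, and scalar columns are grouped back into lists.

import pandas as pd

df = pd.DataFrame({'id': [1, 2], 'vals': [[10, 20], [30]]})
exploded = _explode(df, 'vals')['out_table']
print(exploded)   # rows: (1, 10), (1, 20), (2, 30)
restored = _explode(exploded, 'vals')['out_table']
print(restored)   # back to one list-valued row per id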
Example 22
def _outlier_detection_tukey_carling(table,
                                     input_cols,
                                     outlier_method='tukey',
                                     multiplier=None,
                                     number_of_removal=1,
                                     result_type='add_prediction',
                                     new_column_prefix='is_outlier_'):
    out_table = table.copy()
    median = out_table[input_cols].median()
    q1s = out_table[input_cols].quantile(0.25)
    q3s = out_table[input_cols].quantile(0.75)
    iqrs = q3s - q1s
    output_col_names = []

    if outlier_method == 'tukey':
        if multiplier is None:
            multiplier = 1.5
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix,
                                                     col=col)
            output_col_names.append(output_col_name)
            out_table[output_col_name] = out_table[col].apply(
                lambda _: _tukey(_, q1s[col], q3s[col], iqrs[col], multiplier))
    elif outlier_method == 'carling':
        if multiplier is None:
            multiplier = 2.3
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix,
                                                     col=col)
            output_col_names.append(output_col_name)
            out_table[output_col_name] = out_table[col].apply(
                lambda _: _carling(_, median[col], iqrs[col], multiplier))
    else:
        raise_runtime_error("Please check 'outlier_method'.")

    # result_type is one of 'add_prediction', 'remove_outliers', 'both'
    if result_type == 'add_prediction':
        pass
    elif result_type == 'remove_outliers':
        prediction = out_table[output_col_names].apply(
            lambda row: np.sum(row == 'out') < number_of_removal, axis=1)
        out_table = out_table[prediction.values]
        out_table = out_table.drop(output_col_names, axis=1)
    elif result_type == 'both':
        prediction = out_table[output_col_names].apply(
            lambda row: np.sum(row == 'out') < number_of_removal, axis=1)
        out_table = out_table[prediction.values]
    else:
        raise_runtime_error("Please check 'result_type'.")

    params = {
        'Input Columns': input_cols,
        'Outlier Method': outlier_method,
        'Multiplier': multiplier,
        'Number of Outliers in a Row': number_of_removal,
        'Result Type': result_type,
        'New Column Prefix': new_column_prefix
    }

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Outlier Detection (Tukey/Carling) Result
    | ### Parameters
    |
    | {display_params}
    |
    """.format(display_params=dict2MD(params))))

    model = _model_dict('outlier_detection_tukey_carling')
    model['params'] = params
    model['input_cols'] = input_cols
    model['outlier_method'] = outlier_method
    model['multiplier'] = multiplier
    model['number_of_removal'] = number_of_removal
    model['result_type'] = result_type
    model['median'] = median
    model['q1'] = q1s
    model['q3'] = q3s
    model['iqr'] = iqrs
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
Example 23
def _check_image_col(table, input_col):
    if not _is_image_col(table, input_col):
        raise_runtime_error(
            "input column {} is not an image column.".format(input_col))
Example 24
def _cross_table(table, input_cols_1, input_cols_2, result='N', margins=False):

    df1 = [table[col] for col in input_cols_1]
    df2 = [table[col] for col in input_cols_2]

    # cross table
    if result == 'N':
        result_table = pd.crosstab(df1, df2, margins=margins)
    elif result == 'N / Row Total':
        result_table = pd.crosstab(df1,
                                   df2,
                                   margins=margins,
                                   normalize='index')
    elif result == 'N / Column Total':
        result_table = pd.crosstab(df1,
                                   df2,
                                   margins=margins,
                                   normalize='columns')
    elif result == 'N / Total':
        result_table = pd.crosstab(df1, df2, margins=margins, normalize='all')
    else:
        raise_runtime_error("Please check 'result'.")

    # each row and column name
    row_names = list(result_table.index)[:]
    if len(input_cols_1) == 1:
        joined_row_name = [str(i) for i in row_names]
    else:
        if not margins:
            joined_row_name = [
                '_'.join(str(s) for s in row_names[i])
                for i in range(len(row_names))
            ]
        else:
            joined_row_name = [
                '_'.join(str(s) for s in row_names[i])
                for i in range(len(row_names) - 1)
            ] + [row_names[-1][0]]

    column_names = list(result_table.columns)[:]
    if len(input_cols_2) == 1:
        joined_column_name = [str(i) for i in column_names]
    else:
        if not margins:
            joined_column_name = [
                '_'.join(str(s) for s in column_names[i])
                for i in range(len(column_names))
            ]
        else:
            joined_column_name = [
                '_'.join(str(s) for s in column_names[i])
                for i in range(len(column_names) - 1)
            ] + [column_names[-1][0]]

    # cross table
    if result == 'N':
        result_table.insert(loc=0, column=' ', value=joined_row_name)
        result_table.columns = np.append('N', joined_column_name)
    # cross table normalize by row
    elif result == 'N / Row Total':
        result_table.insert(loc=0, column=' ', value=joined_row_name)
        result_table.columns = np.append('N / Row Total', joined_column_name)
    # cross table normalize by column
    elif result == 'N / Column Total':
        result_table.insert(loc=0, column=' ', value=joined_row_name)
        result_table.columns = np.append('N / Column Total',
                                         joined_column_name)
    # cross table normalize by all values
    elif result == 'N / Total':
        result_table.insert(loc=0, column=' ', value=joined_row_name)
        result_table.columns = np.append('N / Total', joined_column_name)
    else:
        raise_runtime_error("Please check 'result'.")

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Cross Table Result
    | ### Result Type : {result}
    |
    | #### Result Table
    |
    | {result_table}
    |
    """.format(result=result,
               result_table=pandasDF2MD(result_table,
                                        num_rows=len(result_table.index) +
                                        1))))

    model = _model_dict('cross_table')
    model['result'] = result
    model['result_table'] = result_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
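A minimal sketch of the pd.crosstab core above, outside the Brightics report wrapper; the toy frame is hypothetical.

import pandas as pd

df = pd.DataFrame({'sex': ['m', 'f', 'm', 'f'],
                   'smoker': ['y', 'y', 'n', 'n']})
# The list-of-Series form matches how _cross_table builds df1/df2 from input_cols.
ct = pd.crosstab([df['sex']], [df['smoker']], margins=False)
print(ct)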
Example 25
def raise_runtime_error(error_message, true_condition=False):
    common_validation.raise_runtime_error(error_message, true_condition)


def _penalized_linear_regression_train(table,
                                       feature_cols,
                                       label_col,
                                       regression_type='ridge',
                                       alpha=1.0,
                                       l1_ratio=0.5,
                                       fit_intercept=True,
                                       max_iter=1000,
                                       tol=0.0001,
                                       random_state=None):
    out_table = table.copy()
    feature_names, features = check_col_type(out_table, feature_cols)
    label = out_table[label_col]
    if regression_type == 'ridge':
        regression_model = Ridge(alpha=alpha,
                                 fit_intercept=fit_intercept,
                                 max_iter=None,
                                 tol=tol,
                                 solver='auto',
                                 random_state=random_state)
    elif regression_type == 'lasso':
        regression_model = Lasso(alpha=alpha,
                                 fit_intercept=fit_intercept,
                                 max_iter=max_iter,
                                 tol=tol,
                                 random_state=random_state,
                                 selection='random')
    elif regression_type == 'elastic_net':
        regression_model = ElasticNet(alpha=alpha,
                                      l1_ratio=l1_ratio,
                                      fit_intercept=fit_intercept,
                                      max_iter=max_iter,
                                      tol=tol,
                                      random_state=random_state,
                                      selection='random')
    else:
        raise_runtime_error("Please check 'regression_type'.")

    regression_model.fit(features, label)

    out_table1 = pd.DataFrame([])
    out_table1['x_variable_name'] = [variable for variable in feature_names]
    out_table1['coefficient'] = regression_model.coef_
    intercept = pd.DataFrame(
        [['intercept', regression_model.intercept_]],
        columns=['x_variable_name', 'coefficient'])
    if fit_intercept:
        out_table1 = pd.concat([out_table1, intercept], ignore_index=True)

    predict = regression_model.predict(features)
    residual = label - predict

    out_table['predict'] = predict
    out_table['residual'] = residual

    if regression_type == 'elastic_net':
        params = {
            'Feature Columns': feature_names,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'L1 Ratio': l1_ratio,
            'Fit Intercept': fit_intercept,
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }
    else:
        params = {
            'Feature Columns': feature_names,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'Fit Intercept': fit_intercept,
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }

    score = {
        'MSE': mean_squared_error(label, predict),
        'R2': r2_score(label, predict)
    }

    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()

    # checking the magnitude of coefficients

    plt.figure()
    predictors = feature_names
    coef = Series(regression_model.coef_, predictors).sort_values()
    coef.plot(kind='bar', title='Model Coefficients')
    plt.tight_layout()
    fig_model_coefficients = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | # Penalized Linear Regression Result
    | ### Selected Parameters: 
    | {params}
    |
    | ## Results
    | ### Model Parameters
    | {out_table1}
    |
    | ### Regression Score
    | {score}
    |
    """.format(params=dict2MD(params),
               out_table1=pandasDF2MD(out_table1),
               score=dict2MD(score))))
    rb.addMD(
        strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    |
    | ### Magnitude of Coefficients
    | {image5}
    |
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3,
               image5=fig_model_coefficients)))

    model = _model_dict('penalized_linear_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['regression_type'] = regression_type
    model['regression_model'] = regression_model
    model['parameters'] = params
    model['model_parameters'] = out_table1
    model['prediction_residual'] = out_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Example 27
def _lda4(table,
          input_col,
          topic_name='topic',
          num_voca=1000,
          num_topic=5,
          num_topic_word=10,
          max_iter=20,
          learning_method='online',
          learning_offset=10.,
          random_state=None):
    # generate model
    corpus = np.array(table[input_col])
    if isinstance(corpus[0], np.ndarray):
        tf_vectorizer = CountVectorizer(preprocessor=' '.join,
                                        stop_words='english',
                                        max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca)
    else:
        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca,
                                        stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()
    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")
    log_likelihood = lda_model.score(term_count)
    perplexity = lda_model.perplexity(term_count)

    # create topic table
    vocab_weights_list = []
    vocab_list = []
    weights_list = []
    topic_term_prob = normalize(lda_model.components_, norm='l1')
    for vector in topic_term_prob:
        pairs = []
        for term_idx, value in enumerate(vector):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        vocab_weights = []
        vocab = []
        weights = []
        for pair in pairs[:num_topic_word]:
            vocab_weights.append("{}: {}".format(pair[1], pair[0]))
            vocab.append(pair[1])
            weights.append(pair[0])
        vocab_weights_list.append(vocab_weights)
        vocab_list.append(vocab)
        weights_list.append(weights)
    topic_table = pd.DataFrame({
        'vocabularies_weights': vocab_weights_list,
        'vocabularies': vocab_list,
        'weights': weights_list
    })
    topic_table['index'] = [idx + 1 for idx in topic_table.index]
    topic_table = topic_table[[
        'index', 'vocabularies_weights', 'vocabularies', 'weights'
    ]]

    # create output table
    doc_topic = lda_model.transform(term_count)
    out_table = pd.DataFrame.copy(table, deep=True)
    topic_dist_name = topic_name + '_distribution'
    if topic_name in table.columns or topic_dist_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Existing table contains Topic Column Name. Please choose again."
        }])
    out_table[topic_name] = [
        doc_topic[i].argmax() + 1 for i in range(len(corpus))
    ]
    out_table[topic_dist_name] = doc_topic.tolist()

    # pyLDAvis
    prepared_data = ldavis.prepare(lda_model, term_count, tf_vectorizer)
    html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {
        'Input column': input_col,
        'Topic column name': topic_name,
        'Number of topics': num_topic,
        'Number of words for each topic': num_topic_word,
        'Maximum number of iterations': max_iter,
        'Learning method': learning_method,
        'Learning offset': learning_offset,
        'Seed': random_state
    }
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Latent Dirichlet Allocation Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(
        strip_margin("""
    |
    | ### Log Likelihood
    | {log_likelihood}
    |
    | ### Perplexity
    | {perplexity}
    |
    | ### Parameters
    | {params}
    """.format(log_likelihood=log_likelihood,
               perplexity=perplexity,
               params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['lda_model'] = lda_model
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
Example 28
def _lda(table,
         input_col,
         num_voca=1000,
         num_topic=3,
         num_topic_word=3,
         max_iter=20,
         learning_method='online',
         learning_offset=10.,
         random_state=None):
    corpus = table[input_col]
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=num_voca,
                                    stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")

    topic_model = pd.DataFrame([])
    topic_idx_list = []
    voca_weights_list = []
    for topic_idx, weights in enumerate(lda_model.components_):
        topic_idx_list.append("Topic {}".format(topic_idx))
        pairs = []
        for term_idx, value in enumerate(weights):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        voca_weights = []
        for pair in pairs[:num_topic_word]:
            voca_weights.append("{}: {}".format(pair[1], pair[0]))
        voca_weights_list.append(voca_weights)
    topic_model['topic idx'] = topic_idx_list
    topic_model['topic vocabularies'] = voca_weights_list

    doc_topic = lda_model.transform(term_count)

    doc_classification = pd.DataFrame()
    doc_classification['documents'] = [doc for doc in corpus]
    doc_classification['top topic'] = [
        "Topic {}".format(doc_topic[i].argmax()) for i in range(len(corpus))
    ]

    params = {
        'Input Column': input_col,
        'Number of Vocabularies': num_voca,
        'Number of Topics': num_topic,
        'Number of Terminologies': num_topic_word,
        'Iterations': max_iter,
        'Learning Method': learning_method,
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Latent Dirichlet Allocation Result"""))
    rb.addMD(
        strip_margin("""
    |
    |### Parameters
    |
    | {display_params}
    |
    |### Topic Model
    |
    |{topic_model}
    |
    |### Documents Classification
    |
    |{doc_classification}
    |
    """.format(display_params=dict2MD(params),
               topic_model=pandasDF2MD(topic_model, num_rows=num_topic + 1),
               doc_classification=pandasDF2MD(doc_classification,
                                              num_rows=len(corpus) + 1))))

    model = _model_dict('lda')
    model['parameter'] = params
    model['topic_model'] = topic_model
    model['documents_classification'] = doc_classification
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
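A minimal, self-contained sketch of the LDA core shared by the _lda variants above, on a toy corpus just large enough that min_df=2 keeps a non-empty vocabulary; the report and model plumbing is omitted.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

corpus = ['apple banana fruit', 'banana orange fruit',
          'dog cat pet', 'cat hamster pet']
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
term_count = tf_vectorizer.fit_transform(corpus)

lda_model = LatentDirichletAllocation(n_components=2, max_iter=20,
                                      learning_method='online',
                                      learning_offset=10., random_state=0)
doc_topic = lda_model.fit_transform(term_count)
print(doc_topic.argmax(axis=1))  # top topic index per document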
Example 29
def _lda3(table,
          input_col,
          topic_name='topic',
          num_voca=1000,
          num_topic=3,
          num_topic_word=3,
          max_iter=20,
          learning_method='online',
          learning_offset=10.,
          random_state=None):
    corpus = np.array(table[input_col])
    if isinstance(corpus[0], np.ndarray):
        tf_vectorizer = CountVectorizer(preprocessor=' '.join,
                                        stop_words='english',
                                        max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca)
    else:
        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca,
                                        stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")
    voca_weights_list = []
    for weights in lda_model.components_:
        pairs = []
        for term_idx, value in enumerate(weights):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        voca_weights = []
        for pair in pairs[:num_topic_word]:
            voca_weights.append("{}: {}".format(pair[1], pair[0]))
        voca_weights_list.append(voca_weights)

    doc_topic = lda_model.transform(term_count)
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Existing table contains Topic Column Name. Please choose again."
        }])
    out_table[topic_name] = [doc_topic[i].argmax() for i in range(len(corpus))]
    weight_list = []
    for ind in out_table[topic_name]:
        weight_list.append(voca_weights_list[ind])
    out_table['topic_vocabularies'] = weight_list
    return {'out_table': out_table}
Example 30
def _hierarchical_clustering(table,
                             input_cols,
                             input_mode='original',
                             key_col=None,
                             link='complete',
                             met='euclidean',
                             num_rows=20,
                             figure_height=6.4,
                             orient='right'):
    out_table = table.copy()
    features = out_table[input_cols]

    if input_mode == 'original':
        len_features = len(features)
        if key_col is not None:
            data_names = list(out_table[key_col])
        else:
            data_names = ['pt_' + str(i) for i in range(len_features)]
        out_table['name'] = data_names
        Z = linkage(ssd.pdist(features, metric=met), method=link, metric=met)
    elif input_mode == 'matrix':
        len_features = len(input_cols)
        if key_col is not None:
            data_names = [
                out_table[key_col][out_table.columns.get_loc(column)]
                for column in input_cols
            ]
        else:
            data_names = list(input_cols)
        col_index = []
        for column in input_cols:
            col_index.append(out_table.columns.get_loc(column))
        dist_matrix = features.iloc[col_index]

        Z = linkage(ssd.squareform(dist_matrix), method=link, metric=met)
        dist_matrix['name'] = data_names
    else:
        raise_runtime_error("Please check 'input_mode'.")

    range_len_Z = range(len(Z))
    linkage_matrix = pd.DataFrame([])
    linkage_matrix['linkage step'] = [
        '%g' % (x + 1) for x in reversed(range_len_Z)
    ]
    linkage_matrix['name of clusters'] = [
        'CL_%g' % (i + 1) for i in reversed(range_len_Z)
    ]
    joined_column1 = []
    for i in range_len_Z:
        if Z[:, 0][i] < len_features:
            joined_column1.append(data_names[int(Z[:, 0][i])])
        elif Z[:, 0][i] >= len_features:
            joined_column1.append(
                linkage_matrix['name of clusters'][Z[:, 0][i] - len_features])
    linkage_matrix['joined column1'] = joined_column1
    joined_column2 = []
    for i in range_len_Z:
        if Z[:, 1][i] < len_features:
            joined_column2.append(data_names[int(Z[:, 1][i])])
        elif Z[:, 1][i] >= len_features:
            joined_column2.append(
                linkage_matrix['name of clusters'][Z[:, 1][i] - len_features])
    linkage_matrix['joined column2'] = joined_column2

    linkage_matrix['distance'] = [distance for distance in Z[:, 2]]
    linkage_matrix['number of original'] = [
        int(entities) for entities in Z[:, 3]
    ]
    linkage_matrix = linkage_matrix.reindex(
        index=linkage_matrix.index[::-1])[0:]

    # calculate full dendrogram

    plt.figure(figsize=(8.4, figure_height))
    dendrogram(Z,
               truncate_mode='none',
               get_leaves=True,
               orientation=orient,
               labels=data_names,
               leaf_rotation=45,
               leaf_font_size=10.,
               show_contracted=False)
    plt.title('Hierarchical Clustering Dendrogram')
    if orient == 'top':
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient == 'right':
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    plt.tight_layout()
    plt2 = plt2MD(plt)
    plt.clf()

    params = {
        'Input Columns': input_cols,
        'Input Mode': input_mode,
        'Linkage Method': link,
        'Metric': met,
        'Number of Rows in Linkage Matrix': num_rows
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Hierarchical Clustering Result"""))
    rb.addMD(
        strip_margin("""
    |### Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    |{display_params}
    |
    |### Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=plt2,
               display_params=dict2MD(params),
               out_table1=pandasDF2MD(linkage_matrix.head(num_rows),
                                      num_rows=num_rows + 1))))

    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_mode'] = input_mode
    model['table'] = out_table
    if input_mode == 'matrix':
        model['dist_matrix'] = dist_matrix
    model['parameters'] = params
    model['linkage_matrix'] = linkage_matrix
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
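A minimal sketch of the scipy core used in the 'original' input mode above, without the dendrogram plotting or report helpers; the toy points are hypothetical.

import numpy as np
import scipy.spatial.distance as ssd
from scipy.cluster.hierarchy import linkage, fcluster

X = np.array([[0.0, 0.0], [0.1, 0.1], [5.0, 5.0], [5.1, 4.9]])
Z = linkage(ssd.pdist(X, metric='euclidean'), method='complete',
            metric='euclidean')
print(fcluster(Z, t=2, criterion='maxclust'))  # e.g. [1 1 2 2]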