def _label_encoder2(table, input_cols, suffix='_index'):
    out_table = table.copy()
    out_model_list = [None] * len(input_cols)
    new_col_list = []
    number_distinct_classes = []
    for ind, col in enumerate(input_cols):
        le = LabelEncoder().fit(table[col])
        out_model_list[ind] = le
        new_col_name = col + suffix
        new_col_list.append(new_col_name)
        number_distinct_classes.append(len(le.classes_))
        out_table[new_col_name] = le.transform(table[col])

    out_model = _model_dict('label_encoders')
    out_model['label_encoders'] = out_model_list
    out_model['input_cols'] = input_cols

    rb = BrtcReprBuilder()
    params = {"Input columns": input_cols, "Suffix": suffix}
    summary_table = pd.DataFrame()
    summary_table['Input columns'] = input_cols
    summary_table['No. distinct classes'] = number_distinct_classes
    summary_table['New column names'] = new_col_list
    rb.addMD(strip_margin("""
    | ## Label Encoder Model
    | ### Parameters
    | {params}
    | ### Summary
    | {summary_table}
    """.format(params=dict2MD(params),
               summary_table=pandasDF2MD(summary_table))))
    out_model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': out_model}
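# Usage sketch for _label_encoder2 (hypothetical data; assumes the module's
# usual imports such as pandas as pd and sklearn's LabelEncoder are in scope).
def _example_label_encoder2():
    df = pd.DataFrame({'color': ['red', 'blue', 'red'],
                       'size': ['S', 'M', 'S']})
    res = _label_encoder2(df, input_cols=['color', 'size'])
    # res['out_table'] gains 'color_index' and 'size_index'; classes are
    # sorted alphabetically, so 'color' encodes to [1, 0, 1].
    return res['out_table']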
def _ada_boost_classification_train(table, feature_cols, label_col,
                                    max_depth=1, n_estimators=50,
                                    learning_rate=1.0, algorithm='SAMME.R',
                                    random_state=None):
    x_train = table[feature_cols]
    y_train = table[label_col]
    base_estimator = DecisionTreeClassifier(max_depth=max_depth)
    classifier = AdaBoostClassifier(base_estimator,
                                    n_estimators=n_estimators,
                                    learning_rate=learning_rate,
                                    algorithm=algorithm,
                                    random_state=random_state)
    classifier.fit(x_train, y_train)

    params = {'feature_cols': feature_cols,
              'label_col': label_col,
              'feature_importance': classifier.feature_importances_,
              'n_estimators': n_estimators,
              'learning_rate': learning_rate,
              'algorithm': algorithm,
              'random_state': random_state}

    model = _model_dict('ada_boost_classification_model')
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier
    model['params'] = params

    fig_feature_importance = _plot_feature_importance(feature_cols, classifier)
    params = dict2MD(get_param)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## AdaBoost Classification Train Result
    |
    | ### Feature Importance
    | {fig_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_feature_importance=fig_feature_importance,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    feature_importance = classifier.feature_importances_
    feature_importance_table = pd.DataFrame(
        [[feature_cols[i], feature_importance[i]]
         for i in range(len(feature_cols))],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table

    return {'model': model}
def _svm_classification_train(table, feature_cols, label_col, c=1.0,
                              kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                              shrinking=True, probability=True, tol=1e-3,
                              max_iter=-1, random_state=None):
    validate(greater_than(c, 0.0, 'c'))

    _table = table.copy()
    _feature_cols = _table[feature_cols]
    _label_col = _table[label_col]

    if sklearn_utils.multiclass.type_of_target(_label_col) == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')

    _svc = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
                   shrinking=shrinking, probability=probability, tol=tol,
                   max_iter=max_iter, random_state=random_state)
    _svc_model = _svc.fit(_feature_cols, _label_col)

    get_param = _svc.get_params()
    get_param['feature_cols'] = feature_cols
    get_param['label_col'] = label_col

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter}
    """.format(table_parameter=dict2MD(get_param))))

    _model = _model_dict('svc_model')
    _model['svc_model'] = _svc_model
    _model['features'] = feature_cols
    _model['_repr_brtc_'] = rb.get()

    return {'model': _model}
def _bow(table, input_col, add_words=None, no_below=1, no_above=0.8,
         keep_n=10000):
    word_list = table[input_col].tolist()
    dictionary = Dictionary(word_list)
    if add_words is not None:
        dictionary.add_documents([add_words])
    dictionary.filter_extremes(no_below=no_below, no_above=no_above,
                               keep_n=keep_n, keep_tokens=None)

    params = {'Input Column': input_col,
              'Minimum Number of Occurrence': no_below,
              'Maximum Fraction of Occurrence': no_above,
              'Keep N most Frequent': keep_n}

    empty_description = ''
    if len(list(dictionary.dfs.values())) == 0:
        out_table = pd.DataFrame([], columns=['token', 'document_frequency'])
        empty_description = ('Out table is empty since parameter "Minimum '
                             'Number of Occurrence" is greater than the '
                             'maximum of document frequency.')
    else:
        out_table = pd.DataFrame.from_dict(dictionary.token2id,
                                           orient='index').drop([0], axis=1)
        out_table.insert(loc=0, column='token',
                         value=dictionary.token2id.keys())
        token_cnt = sorted(dictionary.dfs.items(), key=operator.itemgetter(0))
        dfs_list = []
        for i in range(len(dictionary.dfs)):
            dfs_list.append(token_cnt[i][1])
        out_table['document_frequency'] = dfs_list

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    |# Bag of Words Result
    |### Parameters
    |
    | {display_params}
    |
    | {description}
    |
    """.format(display_params=dict2MD(params),
               description=empty_description)))

    model = _model_dict('bow')
    model['dict_table'] = out_table
    model['dictionary'] = dictionary
    model['add_words'] = add_words
    model['_repr_brtc_'] = rb.get()

    return {'model': model, 'out_table': out_table}
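# Usage sketch for _bow (hypothetical data). The input column is expected to
# hold already-tokenized documents, i.e. lists of tokens.
def _example_bow():
    df = pd.DataFrame({'tokens': [['cat', 'dog'], ['dog', 'fish'], ['dog']]})
    res = _bow(df, input_col='tokens', no_below=1, no_above=1.0, keep_n=100)
    # res['out_table'] lists each surviving token with its document
    # frequency, e.g. 'dog' appears in all three documents.
    return res['out_table']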
def _ada_boost_regression_train(table, feature_cols, label_col, max_depth=3,
                                n_estimators=50, learning_rate=1.0,
                                loss='linear', random_state=None):
    feature_names, x_train = check_col_type(table, feature_cols)
    y_train = table[label_col]
    base_estimator = DecisionTreeRegressor(max_depth=max_depth)
    regressor = AdaBoostRegressor(base_estimator,
                                  n_estimators=n_estimators,
                                  learning_rate=learning_rate,
                                  loss=loss,
                                  random_state=random_state)
    regressor.fit(x_train, y_train)

    params = {'feature_cols': feature_cols,
              'label_col': label_col,
              'feature_importance': regressor.feature_importances_,
              'n_estimators': n_estimators,
              'learning_rate': learning_rate,
              'loss': loss,
              'random_state': random_state}

    model = _model_dict('ada_boost_regression_model')
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor
    model['params'] = params

    fig_feature_importance = _plot_feature_importance(feature_names, regressor)
    params = dict2MD(get_param)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## AdaBoost Regression Train Result
    |
    | ### Feature Importance
    | {fig_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_feature_importance=fig_feature_importance,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    feature_importance = regressor.feature_importances_
    feature_importance_table = pd.DataFrame(
        [[feature_names[i], feature_importance[i]]
         for i in range(len(feature_names))],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table

    return {'model': model}
def _hierarchical_clustering_post(table, model, num_clusters,
                                  cluster_col='prediction'):
    Z = model['model']
    mode = model['input_mode']
    if mode == 'matrix':
        distance_matrix = model['dist_matrix']
    out_table = model['linkage_matrix']
    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    if mode == 'original':
        prediction_table = table.copy()
    elif mode == 'matrix':
        prediction_table = distance_matrix
    prediction_table[cluster_col] = predict

    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])

    clusters_info_table = pd.DataFrame([])
    clusters_info_table[cluster_col] = M
    clusters_info_table['name_of_clusters'] = which_cluster
    clusters_info_table = clusters_info_table.sort_values(cluster_col)
    cluster_count = np.bincount(prediction_table[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    clusters_info_table['num_of_entities'] = list(cluster_count)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin(
        """### Hierarchical Clustering Post Process Result"""))
    rb.addMD(strip_margin("""
    |### Parameters
    |
    |{display_params}
    |
    |## Clusters Information
    |
    |{clusters_info_table}
    |
    """.format(display_params=dict2MD(model['parameters']),
               clusters_info_table=pandasDF2MD(clusters_info_table))))

    model = _model_dict('hierarchical_clustering_post')
    model['clusters_info'] = clusters_info_table
    model['_repr_brtc_'] = rb.get()

    return {'out_table': prediction_table, 'model': model}
def _outlier_detection_lof(table, input_cols, n_neighbors=20,
                           result_type='add_prediction',
                           new_column_name='is_outlier'):
    out_table = table.copy()
    features = out_table[input_cols]
    lof_model = LocalOutlierFactor(n_neighbors=n_neighbors, algorithm='auto',
                                   leaf_size=30, metric='minkowski', p=2,
                                   novelty=True, contamination=0.1)
    lof_model.fit(features)

    isinlier = lambda _: 'in' if _ == 1 else 'out'
    out_table[new_column_name] = [
        isinlier(lof_predict) for lof_predict in lof_model.predict(features)]

    # result_type is one of 'add_prediction', 'remove_outliers' and 'both'.
    if result_type == 'add_prediction':
        pass
    elif result_type == 'remove_outliers':
        out_table = out_table[out_table[new_column_name] == 'in']
        out_table = out_table.drop(new_column_name, axis=1)
    elif result_type == 'both':
        out_table = out_table[out_table[new_column_name] == 'in']
    else:
        raise_runtime_error("Please check 'result_type'.")

    params = {'Input Columns': input_cols,
              'Result Type': result_type,
              'Number of Neighbors': n_neighbors}

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Outlier Detection (Local Outlier Factor) Result
    | ### Parameters
    |
    | {display_params}
    |
    """.format(display_params=dict2MD(params))))

    model = _model_dict('outlier_detection_lof')
    model['params'] = params
    model['lof_model'] = lof_model
    model['input_cols'] = input_cols
    model['result_type'] = result_type
    model['num_neighbors'] = n_neighbors
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
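# Usage sketch for _outlier_detection_lof (hypothetical data). n_neighbors
# must stay below the number of rows.
def _example_outlier_detection_lof():
    df = pd.DataFrame({'x': [1.0, 1.1, 0.9, 1.0, 8.0],
                       'y': [2.0, 2.1, 1.9, 2.0, 9.0]})
    res = _outlier_detection_lof(df, input_cols=['x', 'y'], n_neighbors=3)
    # res['out_table']['is_outlier'] labels each row 'in' or 'out'.
    return res['out_table']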
def _discretize_quantile(table, input_col, num_of_buckets=2,
                         out_col_name='bucket_number'):
    out_table = table.copy()
    out_table[out_col_name], buckets = pd.qcut(table[input_col],
                                               num_of_buckets, labels=False,
                                               retbins=True, precision=10,
                                               duplicates='drop')

    params = {'input_col': input_col,
              'num_of_buckets': num_of_buckets,
              'out_col_name': out_col_name}

    cnt = Counter(out_table[out_col_name].values)

    # Build the per-bucket summary: index, interval and count.
    index_list = []
    bucket_list = []
    cnt_list = []
    for i in range(len(buckets) - 1):  # 'buckets' holds the bin edges.
        left = '[' if i == 0 else '('
        index_list.append(i)
        cnt_list.append(cnt[i])
        bucket_list.append("{left}{lower}, {upper}]".format(
            left=left, lower=buckets[i], upper=buckets[i + 1]))

    # pd.DataFrame.from_items was removed in recent pandas; a plain dict
    # preserves the column order here.
    result = pd.DataFrame({'bucket number': index_list,
                           'buckets': bucket_list,
                           'count': cnt_list})

    # Build model
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Quantile-based Discretization Result
    | ### Result
    | {result}
    |
    | ### Parameters
    | {params}
    """.format(result=pandasDF2MD(result), params=dict2MD(params))))

    model = _model_dict('discretize_quantile')
    model['result'] = result
    model['params'] = params
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
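# Usage sketch for _discretize_quantile (hypothetical data).
def _example_discretize_quantile():
    df = pd.DataFrame({'score': [1, 2, 3, 4, 5, 6, 7, 8]})
    res = _discretize_quantile(df, input_col='score', num_of_buckets=4)
    # Each quartile gets a bucket number 0..3; res['model']['result'] holds
    # the interval and count for each bucket.
    return res['out_table']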
def _scale(table, input_cols, scaler, suffix=None):
    if scaler == 'RobustScaler':
        if suffix is None:
            suffix = '_robust'
        scale = RobustScaler()
    elif scaler == 'StandardScaler':
        if suffix is None:
            suffix = '_standard'
        scale = StandardScaler()
    elif scaler == 'MaxAbsScaler':
        if suffix is None:
            suffix = '_max_abs'
        scale = MaxAbsScaler()
    else:  # minmax
        if suffix is None:
            suffix = '_min_max'
        scale = MinMaxScaler()

    scaled_cols = []
    for col in input_cols:
        scaled_cols.append(col + suffix)

    out_table = table.copy()
    scaled_table = scale.fit_transform(out_table[input_cols])
    out_table[scaled_cols] = pd.DataFrame(data=scaled_table)

    out_model = _model_dict('scaler')
    out_model['input_cols'] = input_cols
    out_model['used_scaler'] = scaler
    out_model['scaler'] = scale
    out_model['suffix'] = suffix

    rb = BrtcReprBuilder()
    params = {"Input columns": input_cols,
              "Normalization method": scaler,
              "Suffix": suffix}
    summary_table = pd.DataFrame()
    summary_table['Input columns'] = input_cols
    summary_table['Normalization method'] = [scaler] * len(input_cols)
    summary_table['New column names'] = scaled_cols
    rb.addMD(strip_margin("""
    | ## Scaler Model
    | ### Parameters
    | {params}
    |
    | ### Summary table
    | {summary_table}
    """.format(params=dict2MD(params),
               summary_table=pandasDF2MD(summary_table))))
    out_model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': out_model}
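# Usage sketch for _scale (hypothetical data); with no suffix given,
# 'StandardScaler' appends '_standard' to each new column name.
def _example_scale():
    df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
    res = _scale(df, input_cols=['a', 'b'], scaler='StandardScaler')
    # res['out_table'] gains 'a_standard' and 'b_standard', each with zero
    # mean and unit variance.
    return res['out_table']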
def _one_sample_ttest_repr(statistics, result_dict, params):
    input_cols = params['input_cols']
    alternatives = params['alternatives']
    hypothesized_mean = params['hypothesized_mean']
    conf_level = params['conf_level']

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## One Sample T Test Result
    | - Statistics = {s}
    | - Hypothesized mean = {h}
    | - Confidence level = {cl}
    """.format(s=statistics, h=hypothesized_mean, cl=conf_level)))

    for input_col in input_cols:
        H1_list = []
        p_list = []
        CI_list = []
        for alter in alternatives:
            test_info = result_dict[input_col][alter]
            H1_list.append(test_info['alternative_hypothesis'])
            p_list.append(test_info['p_value'])
            CI_list.append(test_info['confidence_interval'])
        # pd.DataFrame.from_items was removed in recent pandas; a plain dict
        # preserves the column order here.
        result_table = pd.DataFrame({
            'alternative hypothesis': H1_list,
            'p-value': p_list,
            '%g%% Confidence Interval' % (conf_level * 100): CI_list})
        rb.addMD(strip_margin("""
        | ### Data = {input_col}
        | - t-value = {t_value}
        |
        | {result_table}
        """.format(input_col=input_col,
                   t_value=result_dict[input_col]['t_value'],
                   result_table=pandasDF2MD(result_table))))

    rb.addMD(strip_margin("""
    | ### Parameters
    | {params}
    """.format(params=dict2MD(params))))

    return rb
def _label_encoder(table, input_col, new_column_name='encoded_column'):
    out_table = table.copy()
    le = LabelEncoder().fit(table[input_col])

    out_model = _model_dict('label_encoder')
    out_model['label_encoder'] = le
    out_model['input_col'] = input_col
    out_model['classes'] = le.classes_

    rb = BrtcReprBuilder()
    params = {'Input Column': input_col,
              "No. distinct classes": len(le.classes_),
              "New column name": new_column_name}
    rb.addMD(strip_margin("""
    | ## Label Encoder Model
    | ### Parameters
    | {params}
    |
    """.format(params=dict2MD(params))))
    out_model['_repr_brtc_'] = rb.get()

    out_table[new_column_name] = le.transform(table[input_col])
    return {'out_table': out_table, 'model': out_model}
def _lda(table, input_col, num_voca=1000, num_topic=3, num_topic_word=3,
         max_iter=20, learning_method='online', learning_offset=10.,
         random_state=None):
    corpus = table[input_col]
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=num_voca,
                                    stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic, max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic, max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")

    topic_model = pd.DataFrame([])
    topic_idx_list = []
    voca_weights_list = []
    for topic_idx, weights in enumerate(lda_model.components_):
        topic_idx_list.append("Topic {}".format(topic_idx))
        pairs = []
        for term_idx, value in enumerate(weights):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        voca_weights = []
        for pair in pairs[:num_topic_word]:
            voca_weights.append("{}: {}".format(pair[1], pair[0]))
        voca_weights_list.append(voca_weights)
    topic_model['topic idx'] = topic_idx_list
    topic_model['topic vocabularies'] = voca_weights_list

    doc_topic = lda_model.transform(term_count)
    doc_classification = pd.DataFrame()
    doc_classification['documents'] = [doc for doc in corpus]
    doc_classification['top topic'] = [
        "Topic {}".format(doc_topic[i].argmax()) for i in range(len(corpus))]

    params = {'Input Column': input_col,
              'Number of Vocabularies': num_voca,
              'Number of Topics': num_topic,
              'Number of Terminologies': num_topic_word,
              'Iterations': max_iter,
              'Learning Method': learning_method}

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Latent Dirichlet Allocation Result"""))
    rb.addMD(strip_margin("""
    |
    |### Parameters
    |
    | {display_params}
    |
    |### Topic Model
    |
    |{topic_model}
    |
    |### Documents Classification
    |
    |{doc_classification}
    |
    """.format(display_params=dict2MD(params),
               topic_model=pandasDF2MD(topic_model, num_rows=num_topic + 1),
               doc_classification=pandasDF2MD(doc_classification,
                                              num_rows=len(corpus) + 1))))

    model = _model_dict('lda')
    model['parameter'] = params
    model['topic_model'] = topic_model
    model['documents_classification'] = doc_classification
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _collaborative_filtering_train(table, user_col, item_col, rating_col,
                                   N=10, filter=True, k=5, based='item',
                                   mode='train', method='cosine',
                                   weighted=True, centered=True, targets=None,
                                   normalize=True, workers=1,
                                   filter_minus=False,
                                   maintain_already_scored=True):
    if based == 'item':
        normalize = False
    table_user_col = table[user_col]
    table_item_col = table[item_col]
    rating_col = table[rating_col]
    user_encoder = preprocessing.LabelEncoder()
    item_encoder = preprocessing.LabelEncoder()
    user_encoder.fit(table_user_col)
    item_encoder.fit(table_item_col)
    user_correspond = user_encoder.transform(table_user_col)
    item_correspond = item_encoder.transform(table_item_col)

    if based == 'item':
        item_users = csr_matrix((rating_col,
                                 (item_correspond, user_correspond)))
        # Ratings are shifted by one in check_cen so that zero ratings
        # survive in the sparse matrix; the shift is undone when averaging.
        check_cen = csr_matrix((rating_col + 1,
                                (item_correspond, user_correspond)))
    else:
        item_users = csr_matrix((rating_col,
                                 (user_correspond, item_correspond)))
        check_cen = csr_matrix((rating_col + 1,
                                (user_correspond, item_correspond)))
    centered_ratings = item_users.copy()
    num_item, num_user = item_users.shape

    if centered:
        update_item = []
        update_user = []
        update_rating = []
        for item in range(num_item):
            index = 0
            sum = 0
            for user, rating in _nonzeros(check_cen, item):
                index += 1
                sum += rating
            avg = sum / index - 1  # undo the +1 shift
            for user, rating in _nonzeros(check_cen, item):
                update_item.append(item)
                update_user.append(user)
                update_rating.append(avg)
        centered_ratings -= csr_matrix((update_rating,
                                        (update_item, update_user)))

    if (method == 'adjusted' or normalize) and based == 'item':
        check_cen = check_cen.transpose().tocsr()
    if based == 'user':
        num_user, num_item = num_item, num_user

    user_avg = []
    if normalize:
        for user in range(num_user):
            index = 0
            sum = 0
            # The inner loop variable is renamed to _user to avoid shadowing
            # the outer loop variable.
            for _user, rating in _nonzeros(check_cen, user):
                index += 1
                sum += rating
            avg = sum / index
            user_avg.append(avg)

    if method == 'adjusted':
        update_item = []
        update_user = []
        update_rating = []
        for user in range(num_user):
            sum = 0
            for item, rating in _nonzeros(check_cen, user):
                sum += rating
            avg = sum / num_item
            for item in range(num_item):
                update_item.append(item)
                update_user.append(user)
                update_rating.append(avg)
        if based == 'item':
            centered_ratings -= csr_matrix((update_rating,
                                            (update_item, update_user)))
        else:
            centered_ratings -= csr_matrix((update_rating,
                                            (update_user, update_item)))
        method = 'cosine'

    if based == 'user':
        num_user, num_item = num_item, num_user

    if method == 'cosine':
        similar_coeff = cosine_similarity(centered_ratings)
    elif method == 'pearson':
        result = []
        for i in centered_ratings.toarray():
            result.append(i - np.average(i))
        similar_coeff = cosine_similarity(result)
    elif method == 'jaccard':
        similar_coeff = 1 - pairwise_distances(centered_ratings.toarray(),
                                               metric="hamming")

    if based == 'user':
        item_users = item_users.transpose().tocsr()

    if mode == 'Topn':
        if targets is None:
            targets = user_encoder.classes_
        # np.int was an alias of the builtin int and has been removed from
        # recent NumPy releases, so only the surviving types are checked.
        if table_user_col.dtype in (np.floating, float, int, np.int64):
            targets = [float(i) for i in targets]
        targets_en = user_encoder.transform(targets)
        Topn_result = []
        if workers == 1:
            for user in targets_en:
                recommendations_corre = _recommend(
                    user, item_users, similar_coeff, N, k, method, weighted,
                    centered, based, normalize, user_avg, filter,
                    filter_minus, maintain_already_scored)
                recommendations = []
                for (item, rating) in recommendations_corre:
                    recommendations += [
                        item_encoder.inverse_transform([item])[0], rating]
                Topn_result += [recommendations]
        else:
            Topn_result_tmp = apply_by_multiprocessing_list_to_list(
                targets_en, _recommend_multi, item_users=item_users,
                similar_coeff=similar_coeff, N=N, k=k, method=method,
                weighted=weighted, centered=centered, based=based,
                normalize=normalize, user_avg=user_avg,
                item_encoder=item_encoder, workers=workers,
                filter_minus=filter_minus,
                maintain_already_scored=maintain_already_scored)
            Topn_result = []
            for i in range(workers):
                Topn_result += Topn_result_tmp[i]
        Topn_result = pd.DataFrame(Topn_result)
        Topn_result = pd.concat([pd.DataFrame(targets), Topn_result],
                                axis=1, ignore_index=True)
        column_names = ['user_name']
        for i in range(int((Topn_result.shape[1] - 1) / 2)):
            column_names += ['item_top%d' % (i + 1), 'rating_top%d' % (i + 1)]
        Topn_result.columns = column_names
        return {'out_table': Topn_result}

    parameters = dict()
    parameters['Number of Neighbors'] = k
    parameters['Based'] = based
    if method == 'cosine':
        parameters['Similarity method'] = 'Cosine'
    elif method == 'jaccard':
        parameters['Similarity method'] = 'Jaccard'
    elif method == 'pearson':
        parameters['Similarity method'] = 'Pearson'
    else:
        parameters['Similarity method'] = 'Adjusted Cosine'
    parameters['Use Centered Mean'] = centered
    parameters['Use Weighted Rating'] = weighted

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Collaborative Filtering Result
    |
    | ### Parameters
    | {parameters}
    |
    """.format(parameters=dict2MD(parameters))))

    model = _model_dict('collaborative filtering')
    model['weighted'] = weighted
    model['k'] = k
    model['similar_coeff'] = similar_coeff
    model['item_encoder'] = item_encoder
    model['user_encoder'] = user_encoder
    model['item_users'] = item_users
    model['user_col'] = user_col
    model['item_col'] = item_col
    model['based'] = based
    model['_repr_brtc_'] = rb.get()
    model['normalize'] = normalize
    model['user_avg'] = user_avg

    return {'model': model}
def _outlier_detection_tukey_carling(table, input_cols, outlier_method='tukey',
                                     multiplier=None, number_of_removal=1,
                                     result_type='add_prediction',
                                     new_column_prefix='is_outlier_'):
    out_table = table.copy()
    median = out_table.median()
    q1s = out_table.quantile(0.25)
    q3s = out_table.quantile(0.75)
    iqrs = q3s - q1s
    output_col_names = []

    if outlier_method == 'tukey':
        if multiplier is None:
            multiplier = 1.5
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix,
                                                     col=col)
            output_col_names.append(output_col_name)
            out_table[output_col_name] = out_table[col].apply(
                lambda _: _tukey(_, q1s[col], q3s[col], iqrs[col], multiplier))
    elif outlier_method == 'carling':
        if multiplier is None:
            multiplier = 2.3
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix,
                                                     col=col)
            output_col_names.append(output_col_name)
            out_table[output_col_name] = out_table[col].apply(
                lambda _: _carling(_, median[col], iqrs[col], multiplier))
    else:
        raise_runtime_error("Please check 'outlier_method'.")

    # result_type is one of 'add_prediction', 'remove_outliers' and 'both'.
    if result_type == 'add_prediction':
        pass
    elif result_type == 'remove_outliers':
        prediction = out_table[output_col_names].apply(
            lambda row: np.sum(row == 'out') < number_of_removal, axis=1)
        out_table = out_table[prediction.values]
        out_table = out_table.drop(output_col_names, axis=1)
    elif result_type == 'both':
        prediction = out_table[output_col_names].apply(
            lambda row: np.sum(row == 'out') < number_of_removal, axis=1)
        out_table = out_table[prediction.values]
    else:
        raise_runtime_error("Please check 'result_type'.")

    params = {'Input Columns': input_cols,
              'Outlier Method': outlier_method,
              'Multiplier': multiplier,
              'Number of Outliers in a Row': number_of_removal,
              'Result Type': result_type,
              'New Column Prefix': new_column_prefix}

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Outlier Detection (Tukey/Carling) Result
    | ### Parameters
    |
    | {display_params}
    |
    """.format(display_params=dict2MD(params))))

    model = _model_dict('outlier_detection_tukey_carling')
    model['params'] = params
    model['input_cols'] = input_cols
    model['outlier_method'] = outlier_method
    model['multiplier'] = multiplier
    model['number_of_removal'] = number_of_removal
    model['result_type'] = result_type
    model['median'] = median
    model['q1'] = q1s
    model['q3'] = q3s
    model['iqr'] = iqrs
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
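# Worked example of the Tukey fences used above (hypothetical numbers):
# for x = [1, 2, 2.5, 3, 100], Q1 = 2.0 and Q3 = 3.0, so IQR = 1.0 and the
# fences are [2.0 - 1.5 * 1.0, 3.0 + 1.5 * 1.0] = [0.5, 4.5]; only 100 is
# flagged 'out'.
def _example_outlier_detection_tukey():
    df = pd.DataFrame({'x': [1.0, 2.0, 2.5, 3.0, 100.0]})
    res = _outlier_detection_tukey_carling(df, input_cols=['x'],
                                           outlier_method='tukey',
                                           result_type='remove_outliers')
    return res['out_table']  # the row with x == 100.0 is dropped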
def _naive_bayes_train(table, feature_cols, label_col, alpha=1.0,
                       fit_prior=True, class_prior=None):
    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        # class_prior comes in as a list of 'label:prior' strings; reorder
        # the priors to match the encoded label indices.
        tmp_class_prior = [0] * len(class_prior)
        for elems in class_prior:
            tmp = elems.split(":")
            tmp_class_prior[label_encoder.transform([tmp[0]])[0]] = \
                float(tmp[1])
        class_prior = tmp_class_prior

    nb_model = MultinomialNB(alpha=alpha, fit_prior=fit_prior,
                             class_prior=class_prior)
    nb_model.fit(features, label_correspond)
    class_log_prior = nb_model.class_log_prior_
    feature_log_prob_ = nb_model.feature_log_prob_
    tmp_result = np.hstack(
        (list(map(list, zip(*[label_encoder.classes_] + [class_log_prior]))),
         feature_log_prob_))
    column_names = ['labels', 'pi']
    for feature_col in feature_cols:
        column_names += ['theta_' + feature_col]
    result_table = pd.DataFrame.from_records(tmp_result, columns=column_names)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    # get_param['Prior Probabilities of the Classes'] = class_prior
    get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    _plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_,
                           title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Naive Bayes Classification Result
    |
    | ### Model:Multinomial
    | {result_table}
    | ### Parameters
    | {table_parameter}
    | ### Predicted vs Actual
    | {image1}
    | #### Accuracy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix, accuracy=accuracy,
               result_table=pandasDF2MD(result_table),
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
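# Usage sketch for _naive_bayes_train (hypothetical data). Note the
# 'label:prior' string format expected by class_prior.
def _example_naive_bayes_train():
    df = pd.DataFrame({'f1': [1, 0, 2, 3], 'f2': [0, 1, 1, 0],
                       'y': ['spam', 'ham', 'spam', 'ham']})
    res = _naive_bayes_train(df, feature_cols=['f1', 'f2'], label_col='y',
                             class_prior=['ham:0.3', 'spam:0.7'])
    return res['model']['nb_model']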
def _hierarchical_clustering(table, input_cols, input_mode='original',
                             key_col=None, link='complete', met='euclidean',
                             num_rows=20, figure_height=6.4, orient='right'):
    out_table = table.copy()
    features = out_table[input_cols]

    if input_mode == 'original':
        len_features = len(features)
        if key_col is not None:
            data_names = list(out_table[key_col])
        else:
            data_names = ['pt_' + str(i) for i in range(len_features)]
        out_table['name'] = data_names
        Z = linkage(ssd.pdist(features, metric=met), method=link, metric=met)
    elif input_mode == 'matrix':
        len_features = len(input_cols)
        if key_col is not None:
            data_names = []
            for column in input_cols:
                data_names.append(
                    out_table[key_col][out_table.columns.get_loc(column)])
        else:
            data_names = []
            for column in input_cols:
                data_names.append(
                    out_table.columns[out_table.columns.get_loc(column)])
        col_index = []
        for column in input_cols:
            col_index.append(out_table.columns.get_loc(column))
        dist_matrix = features.iloc[col_index]
        Z = linkage(ssd.squareform(dist_matrix), method=link, metric=met)
        dist_matrix['name'] = data_names
    else:
        raise_runtime_error("Please check 'input_mode'.")

    range_len_Z = range(len(Z))
    linkage_matrix = pd.DataFrame([])
    linkage_matrix['linkage step'] = [
        '%g' % (x + 1) for x in reversed(range_len_Z)]
    linkage_matrix['name of clusters'] = [
        'CL_%g' % (i + 1) for i in reversed(range_len_Z)]

    joined_column1 = []
    for i in range_len_Z:
        if Z[:, 0][i] < len_features:
            joined_column1.append(data_names[int(Z[:, 0][i])])
        else:
            joined_column1.append(
                linkage_matrix['name of clusters'][Z[:, 0][i] - len_features])
    linkage_matrix['joined column1'] = joined_column1

    joined_column2 = []
    for i in range_len_Z:
        if Z[:, 1][i] < len_features:
            joined_column2.append(data_names[int(Z[:, 1][i])])
        else:
            joined_column2.append(
                linkage_matrix['name of clusters'][Z[:, 1][i] - len_features])
    linkage_matrix['joined column2'] = joined_column2

    linkage_matrix['distance'] = [distance for distance in Z[:, 2]]
    linkage_matrix['number of original'] = [
        int(entities) for entities in Z[:, 3]]
    linkage_matrix = linkage_matrix.reindex(
        index=linkage_matrix.index[::-1])[0:]

    # calculate the full dendrogram
    plt.figure(figsize=(8.4, figure_height))
    dendrogram(Z, truncate_mode='none', get_leaves=True, orientation=orient,
               labels=data_names, leaf_rotation=45, leaf_font_size=10.,
               show_contracted=False)
    plt.title('Hierarchical Clustering Dendrogram')
    if orient == 'top':
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient == 'right':
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    plt.tight_layout()
    plt2 = plt2MD(plt)
    plt.clf()

    params = {'Input Columns': input_cols,
              'Input Mode': input_mode,
              'Linkage Method': link,
              'Metric': met,
              'Number of Rows in Linkage Matrix': num_rows}

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Hierarchical Clustering Result"""))
    rb.addMD(strip_margin("""
    |### Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    |{display_params}
    |
    |### Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=plt2, display_params=dict2MD(params),
               out_table1=pandasDF2MD(linkage_matrix.head(num_rows),
                                      num_rows=num_rows + 1))))

    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_mode'] = input_mode
    model['table'] = out_table
    if input_mode == 'matrix':
        model['dist_matrix'] = dist_matrix
    model['parameters'] = params
    model['linkage_matrix'] = linkage_matrix
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _one_hot_encoder2(table, input_cols, prefix='list', prefix_list=None,
                      suffix='index', n_values='auto',
                      categorical_features='all', sparse=True,
                      handle_unknown='error', drop_last=False):
    out_table = table.copy()
    sparse = False
    enc_list = []
    le_list = []
    if drop_last:
        new_col_names_list_with_true_drop_last = []
    new_col_names_list = []
    prefix_list_index = 0
    if prefix == 'list':
        len_prefix_list = 0 if prefix_list is None else len(prefix_list)
        if len(input_cols) != len_prefix_list:
            # TODO: make the error message code
            raise_runtime_error('The number of Input Columns and the number '
                                'of Prefixes should be equal.')
    number_distinct_classes = []
    for col_name in input_cols:
        enc = OneHotEncoder(n_values=n_values,
                            categorical_features=categorical_features,
                            sparse=sparse, handle_unknown=handle_unknown)
        le = LabelEncoder()
        distinct_classes = np.unique(out_table[col_name].values)
        number_distinct_classes.append(len(distinct_classes))
        new_col_names = []
        if suffix == 'index':
            if prefix == 'list':
                for i in range(0, len(distinct_classes)):
                    new_col_names.append(
                        prefix_list[prefix_list_index] + '_' + str(i))
            else:
                for i in range(0, len(distinct_classes)):
                    new_col_names.append(col_name + '_' + str(i))
        else:
            # A raw string avoids the invalid escape sequence warning.
            pattern = re.compile(r"\W")
            for i in distinct_classes:
                i = re.sub(pattern, "_", str(i))
                if prefix == 'list':
                    new_col_names.append(
                        prefix_list[prefix_list_index] + '_' + i)
                else:
                    new_col_names.append(col_name + '_' + i)
        transformed_table = pd.DataFrame(
            enc.fit_transform(
                le.fit_transform(out_table[col_name]).reshape(-1, 1)),
            columns=new_col_names)
        new_col_names_list.append(new_col_names)
        if drop_last:
            new_col_names = new_col_names[:-1]
            new_col_names_list_with_true_drop_last.append(new_col_names)
        for new_col_name in new_col_names:
            out_table[new_col_name] = transformed_table[new_col_name]
        enc_list.append(enc)
        le_list.append(le)
        prefix_list_index = prefix_list_index + 1

    rb = BrtcReprBuilder()
    params = {'Input Columns': input_cols,
              "Prefix Type": prefix,
              "Suffix Type": suffix,
              "Drop Last": drop_last,
              "Number of values per feature": n_values,
              "Categorical features": categorical_features,
              "Error handling": handle_unknown}
    summary_table = pd.DataFrame()
    summary_table['Input Columns'] = input_cols
    summary_table['No. distinct classes'] = number_distinct_classes
    if drop_last:
        summary_table['New encoded columns'] = \
            new_col_names_list_with_true_drop_last
    else:
        summary_table['New encoded columns'] = new_col_names_list
    rb.addMD(strip_margin("""
    | ## One Hot Encoder Model
    | ### Parameters
    | {params}
    |
    | ### Summary
    | {summary_table}
    """.format(params=dict2MD(params),
               summary_table=pandasDF2MD(summary_table))))

    out_model = _model_dict('one_hot_encoder')
    out_model['one_hot_encoder_list'] = enc_list
    out_model['label_encoder_list'] = le_list
    out_model['input_cols'] = input_cols
    out_model['prefix'] = prefix
    out_model['prefix_list'] = prefix_list
    out_model['suffix'] = suffix
    out_model['drop_last'] = drop_last
    out_model['_repr_brtc_'] = rb.get()
    if drop_last:
        out_model['new_col_names_list_with_true_drop_last'] = \
            new_col_names_list_with_true_drop_last
    out_model['new_col_names_list'] = new_col_names_list

    return {'out_table': out_table, 'model': out_model}
def _decision_tree_classification_train(table, feature_cols, label_col,
                                        # fig_size=np.array([6.4, 4.8]),
                                        criterion='gini', splitter='best',
                                        max_depth=None, min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.0,
                                        max_features=None, random_state=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        class_weight=None, presort=False,
                                        sample_weight=None, check_input=True,
                                        X_idx_sorted=None):
    y_train = table[label_col]
    if sklearn_utils.multiclass.type_of_target(y_train) == 'continuous':
        raise_error('0718', 'label_col')

    classifier = DecisionTreeClassifier(
        criterion=criterion, splitter=splitter, max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features, random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        class_weight=class_weight, presort=presort)
    classifier.fit(table[feature_cols], table[label_col], sample_weight,
                   check_input, X_idx_sorted)

    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(classifier, out_file=dot_data,
                        feature_names=feature_cols,
                        class_names=table[label_col].astype('str').unique(),
                        filled=True, rounded=True, special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = ("Graphviz is needed to draw a Decision Tree graph. "
                    "Please download it from http://graphviz.org/download/ "
                    "and install it to your computer.")

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), feature_importance[indices], color='b',
             align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v, i, " {:.2f}".format(v), color='b', va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    # Add tree plot
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _kmeans_train_predict(table, input_cols, n_clusters=3,
                          prediction_col='prediction', init='k-means++',
                          n_init=10, max_iter=300, tol=1e-4,
                          precompute_distances='auto', seed=None, n_jobs=1,
                          algorithm='auto', n_samples=None):
    feature_names, inputarr = check_col_type(table, input_cols)
    if n_samples is None:
        n_samples = len(inputarr)

    k_means = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init,
                       max_iter=max_iter, tol=tol,
                       precompute_distances=precompute_distances, verbose=0,
                       random_state=seed, copy_x=True, n_jobs=n_jobs,
                       algorithm=algorithm)
    k_means.fit(inputarr)

    params = {'input_cols': feature_names, 'n_clusters': n_clusters,
              'init': init, 'n_init': n_init, 'max_iter': max_iter,
              'tol': tol, 'precompute_distances': precompute_distances,
              'seed': seed, 'n_jobs': n_jobs, 'algorithm': algorithm,
              'n_samples': n_samples}

    cluster_centers = k_means.cluster_centers_
    n_clusters = len(cluster_centers)
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    labels = k_means.labels_

    pca2_model = PCA(n_components=min(2, len(feature_names))).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    fig_centers = _kmeans_centers_plot(feature_names, cluster_centers, colors)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples,
                                       cluster_centers, seed, colors)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2,
                               colors)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Sum of square error: {sse_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers}
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=k_means.n_iter_, sse_=k_means.inertia_,
               fig_cluster_centers=fig_centers, fig_pca=fig_pca,
               fig_samples=fig_samples, params=dict2MD(params))))

    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table': out_table, 'model': model}
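# Usage sketch for _kmeans_train_predict (hypothetical data; a fixed seed
# keeps the cluster assignment reproducible).
def _example_kmeans_train_predict():
    df = pd.DataFrame({'x': [0.0, 0.1, 5.0, 5.1],
                       'y': [0.0, 0.2, 5.0, 4.9]})
    res = _kmeans_train_predict(df, input_cols=['x', 'y'], n_clusters=2,
                                seed=42)
    # res['out_table']['prediction'] holds the cluster id of each row.
    return res['out_table']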
def _collaborative_filtering_train(table, user_col, item_col, rating_col,
                                   N=10, k=5, based='item', mode='train',
                                   method='cosine', weighted=True,
                                   centered=True, targets=None,
                                   normalize=True):
    if based == 'item':
        normalize = False
    table_user_col = table[user_col]
    table_item_col = table[item_col]
    rating_col = table[rating_col]
    user_encoder = preprocessing.LabelEncoder()
    item_encoder = preprocessing.LabelEncoder()
    user_encoder.fit(table_user_col)
    item_encoder.fit(table_item_col)
    user_correspond = user_encoder.transform(table_user_col)
    item_correspond = item_encoder.transform(table_item_col)

    item_users = np.zeros((len(item_encoder.classes_),
                           len(user_encoder.classes_)))
    for i in range(len(table_user_col)):
        # Ratings are shifted by one so that rated zeros are distinguishable
        # from missing entries; the shift is undone further below.
        item_users[item_correspond[i]][user_correspond[i]] = rating_col[i] + 1
    centered_ratings = item_users.copy()
    num_item, num_user = item_users.shape

    if centered and based == 'item':
        check_cen = csr_matrix(centered_ratings)
        for item in range(num_item):
            index = 0
            sum = 0
            for user, rating in _nonzeros(check_cen, item):
                index += 1
                sum += rating
            avg = sum / index
            for user, rating in _nonzeros(check_cen, item):
                centered_ratings[item][user] -= avg
    if centered and based == 'user':
        check_cen = csr_matrix(np.transpose(centered_ratings))
        for user in range(num_user):
            index = 0
            sum = 0
            for item, rating in _nonzeros(check_cen, user):
                index += 1
                sum += rating
            avg = sum / index
            for item, rating in _nonzeros(check_cen, user):
                centered_ratings[item][user] -= avg

    for i in range(len(table_user_col)):
        item_users[item_correspond[i]][user_correspond[i]] -= 1

    if method == 'adjusted' or normalize:
        check_cen = csr_matrix(np.transpose(item_users))

    user_avg = []
    if normalize:
        for user in range(num_user):
            index = 0
            sum = 0
            # The inner loop variable is renamed to _user to avoid shadowing
            # the outer loop variable.
            for _user, rating in _nonzeros(check_cen, user):
                index += 1
                sum += rating
            avg = sum / index
            user_avg.append(avg)

    if method == 'adjusted':
        for user in range(num_user):
            sum = 0
            for item, rating in _nonzeros(check_cen, user):
                sum += rating
            avg = sum / num_item
            for item in range(num_item):
                centered_ratings[item][user] -= avg
        method = 'cosine'

    if based == 'item':
        similar_coeff = np.zeros((num_item, num_item))
        for item in range(num_item):
            similar_coeff[item][item] = -1
            for diff_item in range(item + 1, num_item):
                similar_coeff[item][diff_item] = _similar_coeff(
                    centered_ratings, item, diff_item, method)
                similar_coeff[diff_item][item] = similar_coeff[item][diff_item]
    else:
        similar_coeff = np.zeros((num_user, num_user))
        for user in range(num_user):
            similar_coeff[user][user] = -1
            for diff_user in range(user + 1, num_user):
                similar_coeff[user][diff_user] = _similar_coeff(
                    np.transpose(centered_ratings), user, diff_user, method)
                similar_coeff[diff_user][user] = similar_coeff[user][diff_user]

    if mode == 'Topn':
        if targets is None:
            targets = user_encoder.classes_
        targets_en = user_encoder.transform(targets)
        Topn_result = []
        for user in targets_en:
            recommendations_corre = _recommend(user, item_users, similar_coeff,
                                               N, k, method, weighted,
                                               centered, based, normalize,
                                               user_avg)
            recommendations = []
            for (item, rating) in recommendations_corre:
                recommendations += [item_encoder.inverse_transform([item])[0],
                                    rating]
            Topn_result += [recommendations]
        Topn_result = pd.DataFrame(Topn_result)
        Topn_result = pd.concat([pd.DataFrame(targets), Topn_result],
                                axis=1, ignore_index=True)
        column_names = ['user_name']
        for i in range(int((Topn_result.shape[1] - 1) / 2)):
            column_names += ['item_top%d' % (i + 1), 'rating_top%d' % (i + 1)]
        Topn_result.columns = column_names
        return {'out_table': Topn_result}

    parameters = dict()
    parameters['Number of Neighbors'] = k
    parameters['Based'] = based
    if method == 'cosine':
        parameters['Similarity method'] = 'Cosine'
    elif method == 'jaccard':
        parameters['Similarity method'] = 'Jaccard'
    elif method == 'pearson':
        parameters['Similarity method'] = 'Pearson'
    else:
        parameters['Similarity method'] = 'Adjusted Cosine'
    parameters['Use Centered Mean'] = centered
    parameters['Use Weighted Rating'] = weighted

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Collaborative Filtering Result
    |
    | ### Parameters
    | {parameters}
    |
    """.format(parameters=dict2MD(parameters))))

    model = _model_dict('collaborative filtering')
    model['weighted'] = weighted
    model['k'] = k
    model['similar_coeff'] = similar_coeff
    model['item_encoder'] = item_encoder
    model['user_encoder'] = user_encoder
    model['item_users'] = item_users
    model['user_col'] = user_col
    model['item_col'] = item_col
    model['based'] = based
    model['_repr_brtc_'] = rb.get()
    model['normalize'] = normalize
    model['user_avg'] = user_avg

    return {'model': model}
def _dtm(table, input_col, topic_name='topic', num_topic=5, num_topic_word=10,
         max_iter=20, time_slice=None, coherence='u_mass', vis_time=0,
         seed=None):
    running_os = platform.system()
    is_os_64bit = platform.machine().endswith('64')
    if running_os == 'Linux':
        if is_os_64bit:
            dtm_filename = 'dtm-linux64'
        else:
            dtm_filename = 'dtm-linux32'
    elif running_os == 'Windows':
        if is_os_64bit:
            dtm_filename = 'dtm-win64.exe'
        else:
            dtm_filename = 'dtm-win32.exe'
    else:  # Mac
        dtm_filename = 'dtm-darwin64'
    dtm_path = os.path.join(str(pathlib.Path(__file__).parent.absolute()),
                            'dtm', dtm_filename)
    if running_os != 'Windows':
        bash_command = "chmod +x {}".format(dtm_path)
        os.system(bash_command)

    tokenized_doc = np.array(table[input_col])
    num_doc = len(tokenized_doc)
    if time_slice is None:
        time_slice = [num_doc]
    elif sum(time_slice) != num_doc:
        raise_runtime_error("The sum of the time slice list does not match "
                            "the number of documents.")
    if vis_time < 0 or vis_time >= len(time_slice):
        raise_runtime_error("Invalid time parameter: {}".format(vis_time))

    dictionary = corpora.Dictionary(tokenized_doc)
    corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
    dtm_params = {"corpus": corpus,
                  "id2word": dictionary,
                  "time_slices": time_slice,
                  "num_topics": num_topic,
                  "lda_sequence_max_iter": max_iter,
                  "model": 'dtm'}
    if seed is not None:
        dtm_params["rng_seed"] = seed
    dtm_model = DtmModel(dtm_path, **dtm_params)

    topic_time = [[dtm_model.show_topic(topicid=id, time=t,
                                        topn=num_topic_word)
                   for id in range(num_topic)]
                  for t in range(len(time_slice))]
    topic_time = [[["{}: {}".format(tup[1], tup[0]) for tup in topic]
                   for topic in time] for time in topic_time]
    timeline = ["{} ({} docs)".format(ind, t)
                for ind, t in enumerate(time_slice)]
    columns = ["topic_{}".format(i + 1) for i in range(num_topic)]
    topic_table = pd.DataFrame(topic_time, columns=columns)
    topic_table['time'] = timeline
    topic_table = topic_table[['time'] + columns]

    prop_arr = dtm_model.gamma_
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains Topic Column Name. "
                      "Please choose again."}])
    out_table[topic_name] = [item.argmax() + 1 for item in prop_arr]
    out_table['topic_distribution'] = prop_arr.tolist()

    coherence_topic_arr = [dtm_model.dtm_coherence(time)
                           for time in range(len(time_slice))]
    if coherence == 'u_mass':
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary,
                                  corpus=corpus,
                                  coherence='u_mass').get_coherence()
                   for item in coherence_topic_arr]
    else:
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary,
                                  corpus=corpus, texts=tokenized_doc,
                                  coherence='c_v').get_coherence()
                   for item in coherence_topic_arr]

    doc_topic, topic_term, doc_lengths, term_frequency, vocab = \
        dtm_model.dtm_vis(corpus, vis_time)
    prepared_data = plv.prepare(topic_term, doc_topic, doc_lengths, vocab,
                                term_frequency, sort_topics=False)
    html_result = plv.prepared_data_to_html(prepared_data)

    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Time slice': time_slice,
              'Coherence measure': coherence,
              'Time to visualize': vis_time}

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Dynamic Topic Modeling Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    | ### Coherence for each period
    | {coh_arr}
    |
    | ### Parameters
    | {params}
    """.format(coh_arr=coh_arr, params=dict2MD(params))))

    model = _model_dict('dtm_model')
    model['params'] = params
    model['dtm_model'] = dtm_model
    model['coherences'] = coh_arr
    model['corpus'] = corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table,
            'model': model}
def _lda4(table, input_col, topic_name='topic', num_voca=1000, num_topic=5,
          num_topic_word=10, max_iter=20, learning_method='online',
          learning_offset=10., random_state=None):
    # generate model
    corpus = np.array(table[input_col])
    if isinstance(corpus[0], np.ndarray):
        tf_vectorizer = CountVectorizer(preprocessor=' '.join,
                                        stop_words='english', max_df=0.95,
                                        min_df=2, max_features=num_voca)
    else:
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                        max_features=num_voca,
                                        stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic, max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic, max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")
    log_likelihood = lda_model.score(term_count)
    perplexity = lda_model.perplexity(term_count)

    # create topic table
    vocab_weights_list = []
    vocab_list = []
    weights_list = []
    topic_term_prob = normalize(lda_model.components_, norm='l1')
    for vector in topic_term_prob:
        pairs = []
        for term_idx, value in enumerate(vector):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        vocab_weights = []
        vocab = []
        weights = []
        for pair in pairs[:num_topic_word]:
            vocab_weights.append("{}: {}".format(pair[1], pair[0]))
            vocab.append(pair[1])
            weights.append(pair[0])
        vocab_weights_list.append(vocab_weights)
        vocab_list.append(vocab)
        weights_list.append(weights)
    topic_table = pd.DataFrame({'vocabularies_weights': vocab_weights_list,
                                'vocabularies': vocab_list,
                                'weights': weights_list})
    topic_table['index'] = [idx + 1 for idx in topic_table.index]
    topic_table = topic_table[['index', 'vocabularies_weights',
                               'vocabularies', 'weights']]

    # create output table
    doc_topic = lda_model.transform(term_count)
    out_table = pd.DataFrame.copy(table, deep=True)
    topic_dist_name = topic_name + '_distribution'
    if topic_name in table.columns or topic_dist_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains Topic Column Name. "
                      "Please choose again."}])
    out_table[topic_name] = [doc_topic[i].argmax() + 1
                             for i in range(len(corpus))]
    out_table[topic_dist_name] = doc_topic.tolist()

    # pyLDAvis
    prepared_data = ldavis.prepare(lda_model, term_count, tf_vectorizer)
    html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Learning method': learning_method,
              'Learning offset': learning_offset,
              'Seed': random_state}

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Latent Dirichlet Allocation Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    |
    | ### Log Likelihood
    | {log_likelihood}
    |
    | ### Perplexity
    | {perplexity}
    |
    | ### Parameters
    | {params}
    """.format(log_likelihood=log_likelihood, perplexity=perplexity,
               params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['lda_model'] = lda_model
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table,
            'model': model}
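# Usage sketch for _lda4 (hypothetical toy corpus; assumes the module's
# pyLDAvis dependencies are installed). CountVectorizer is fit with min_df=2,
# so the corpus must repeat terms across documents.
def _example_lda4():
    docs = ['apple banana apple', 'banana fruit apple', 'car road car',
            'road car traffic', 'apple fruit banana', 'traffic road car']
    df = pd.DataFrame({'text': docs})
    res = _lda4(df, input_col='text', num_topic=2, num_topic_word=3,
                random_state=0)
    # res['out_table'] assigns each document its most likely topic;
    # res['topic_table'] lists the top words per topic.
    return res['topic_table']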
def _pca(table, input_cols, new_column_name='projected_', n_components=None,
         copy=True, whiten=False, svd_solver='auto', tol=0.0,
         iterated_power='auto', seed=None, hue=None, alpha=0, key_col=None):
    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    # The full PCA is fit; the requested number of components is sliced
    # from the result below.
    pca = PCA(n_components=None, copy=copy, whiten=whiten,
              svd_solver=svd_solver, tol=tol, iterated_power=iterated_power,
              random_state=seed)
    pca_model = pca.fit(table[input_cols])

    column_names = []
    for i in range(0, n_components):
        column_names.append(new_column_name + str(i))

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result[:, :n_components],
                          columns=column_names)
    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components[:n_components],
                                     columns=input_cols)
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_
    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if n_components == 1:
        sns.scatterplot(x=column_names[0], y=column_names[0], hue=hue,
                        data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(
            0, 1, pc_columns=column_names, columns=input_cols,
            singular_values=res_singular_values,
            components=res_components,
            explained_variance_ratio=res_explained_variance_ratio,
            alpha=alpha, hue=hue, data=out_df, ax=plt.gca(), key_col=key_col)

    plt.figure()
    fig_scree = _screeplot(res_explained_variance,
                           res_explained_variance_ratio, n_components)

    table_explained_variance = pd.DataFrame(
        res_explained_variance, columns=['explained_variance'])
    table_explained_variance['explained_variance_ratio'] = \
        res_explained_variance_ratio
    table_explained_variance['cum_explained_variance_ratio'] = \
        res_explained_variance_ratio.cumsum()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## PCA Result
    | ### Plot
    | {image1}
    |
    | ### Explained Variance
    | {fig_scree}
    | {table_explained_variance}
    |
    | ### Components
    | {table2}
    |
    | ### Parameters
    | {parameter1}
    """.format(image1=plt_two, fig_scree=fig_scree,
               table_explained_variance=pandasDF2MD(table_explained_variance),
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df))))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['_repr_brtc_'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    return {'out_table': out_df, 'model': model}
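# Usage sketch for _pca (hypothetical data).
def _example_pca():
    df = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0],
                       'b': [1.1, 1.9, 3.2, 3.9],
                       'c': [0.5, 0.4, 0.6, 0.5]})
    res = _pca(df, input_cols=['a', 'b', 'c'], n_components=2)
    # res['out_table'] gains 'projected_0' and 'projected_1'; the model entry
    # 'explained_variance_ratio' shows the variance each component captures.
    return res['out_table']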
def _gsdmm(table, input_col, topic_name='topic', K=10, alpha=0.1, beta=0.1,
           max_iter=50, num_topic_words=3):
    docs = np.array(table[input_col])
    docs_set = [set(doc) for doc in docs]
    docs_preprocessed = [list(doc_set) for doc_set in docs_set]
    vocab_set = list(set.union(*docs_set))
    vocab_size = len(vocab_set)

    # initialize and train a GSDMM model
    mgp = gsdmm_rwalk.MovieGroupProcess(K=K, alpha=alpha, beta=beta,
                                        n_iters=max_iter)
    topics = mgp.fit(docs_preprocessed, vocab_size)

    # generate topic table
    topic_word_count = mgp.cluster_word_distribution
    topic_words_raw = [[ind, _count_to_ratio_raw(word_count)]
                       for ind, word_count in enumerate(topic_word_count)
                       if word_count]
    topic_words = [[item[0]] + _gen_table(item[1], num_topic_words)
                   for item in topic_words_raw]

    # reset topic ids
    nonempty_topic_indices = [item[0] for item in topic_words]
    reset_topic_ind = {old_ind: (new_ind + 1)
                       for new_ind, old_ind
                       in enumerate(nonempty_topic_indices)}
    topics = [reset_topic_ind[old_ind] for old_ind in topics]
    topic_words = [[reset_topic_ind[old_item[0]]] + old_item[1:]
                   for old_item in topic_words]

    # generate output dataframes
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains the topic column name. "
                      "Please choose another name."}])
    out_table[topic_name] = topics
    columns = ['index', 'vocabularies_weights', 'vocabularies', 'weights']
    topic_table = pd.DataFrame(topic_words, columns=columns)
    topic_table['weights'] = topic_table['weights'].apply(pd.to_numeric)

    # pyLDAvis
    if len(topic_words) == 1:
        html_result = None
    else:
        topic_words_dicts = [item[1] for item in topic_words_raw]
        topic_term_dists = [[topic_words_dict.get(word, 0)
                             for word in vocab_set]
                            for topic_words_dict in topic_words_dicts]
        num_docs = len(topics)
        num_topics = len(topic_words_raw)
        doc_topic_dists = np.zeros((num_docs, num_topics))
        for doc_id, topic_id in enumerate(topics):
            doc_topic_dists[doc_id][topic_id - 1] = 1.0
        doc_lengths = [len(doc) for doc in docs_preprocessed]
        vocab_count = functools.reduce(
            lambda dict_1, dict_2: {word: dict_1.get(word, 0)
                                    + dict_2.get(word, 0)
                                    for word in set(dict_1).union(dict_2)},
            topic_word_count)
        term_frequency = [vocab_count.get(word) for word in vocab_set]
        prepared_data = pyLDAvis.prepare(topic_term_dists, doc_topic_dists,
                                         doc_lengths, vocab_set,
                                         term_frequency)
        html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'K': K,
              'Alpha': alpha,
              'Beta': beta,
              'Maximum number of iterations': max_iter,
              'Number of words for each topic': num_topic_words}

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## GSDMM Result
    | ### Summary
    |
    """))
    if html_result is not None:
        rb.addHTML(html_result)
        rb.addMD(strip_margin("""
        |
        """))
    rb.addMD(strip_margin("""
    | ### Final Number of Topics
    | {num_topics}
    |
    | ### Parameters
    | {params}
    """.format(num_topics=len(topic_words_raw), params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['gsdmm_model'] = mgp
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table,
            'model': model}
def _penalized_linear_regression_train(table, feature_cols, label_col,
                                       regression_type='ridge', alpha=1.0,
                                       l1_ratio=0.5, fit_intercept=True,
                                       max_iter=1000, tol=0.0001,
                                       random_state=None):
    out_table = table.copy()
    feature_names, features = check_col_type(out_table, feature_cols)
    label = out_table[label_col]

    if regression_type == 'ridge':
        # Ridge's 'auto' solver ignores max_iter, so the library default (None) is kept.
        regression_model = Ridge(alpha=alpha, fit_intercept=fit_intercept,
                                 max_iter=None, tol=tol, solver='auto',
                                 random_state=random_state)
    elif regression_type == 'lasso':
        regression_model = Lasso(alpha=alpha, fit_intercept=fit_intercept,
                                 max_iter=max_iter, tol=tol,
                                 random_state=random_state, selection='random')
    elif regression_type == 'elastic_net':
        regression_model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio,
                                      fit_intercept=fit_intercept,
                                      max_iter=max_iter, tol=tol,
                                      random_state=random_state,
                                      selection='random')
    else:
        raise_runtime_error("Please check 'regression_type'.")

    # Fit once and reuse the fitted model; the original refit the model for
    # every coefficient and intercept access.
    regression_model.fit(features, label)

    out_table1 = pd.DataFrame([])
    out_table1['x_variable_name'] = feature_names
    out_table1['coefficient'] = regression_model.coef_
    if fit_intercept:
        intercept = pd.DataFrame([['intercept', regression_model.intercept_]],
                                 columns=['x_variable_name', 'coefficient'])
        out_table1 = pd.concat([out_table1, intercept], ignore_index=True)

    predict = regression_model.predict(features)
    residual = label - predict
    out_table['predict'] = predict
    out_table['residual'] = residual

    params = {'Feature Columns': feature_names,
              'Label Column': label_col,
              'Regression Type': regression_type,
              'Regularization (Penalty Weight)': alpha,
              'Fit Intercept': fit_intercept,
              'Maximum Number of Iterations': max_iter,
              'Tolerance': tol}
    if regression_type == 'elastic_net':
        params['L1 Ratio'] = l1_ratio

    score = {'MSE': mean_squared_error(label, predict),
             'R2': r2_score(label, predict)}

    # Predicted vs. actual, with the identity line for reference.
    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    p1x = np.min(predict)
    p2x = np.max(predict)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()

    # Residuals vs. predicted.
    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    # Q-Q plot of the residuals.
    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    # Residual distribution.
    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()

    # Check the magnitude of the coefficients.
    plt.figure()
    coef = Series(regression_model.coef_, feature_names).sort_values()
    coef.plot(kind='bar', title='Model Coefficients')
    plt.tight_layout()
    fig_model_coefficients = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | # Penalized Linear Regression Result
    | ### Selected Parameters:
    | {params}
    |
    | ## Results
    | ### Model Parameters
    | {out_table1}
    |
    | ### Regression Score
    | {score}
    |
    """.format(params=dict2MD(params), out_table1=pandasDF2MD(out_table1),
               score=dict2MD(score))))
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    |
    | ### Magnitude of Coefficients
    | {image5}
    |
    """.format(image1=fig_actual_predict, image2=fig_residual_1,
               image3=fig_residual_2, image4=fig_residual_3,
               image5=fig_model_coefficients)))

    model = _model_dict('penalized_linear_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['regression_type'] = regression_type
    model['regression_model'] = regression_model
    model['parameters'] = params
    model['model_parameters'] = out_table1
    model['prediction_residual'] = out_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _tfidf(table, input_col, max_df=None, min_df=1, num_voca=1000,
           idf_weighting_scheme='inverseDocumentFrequency', norm='l2',
           smooth_idf=True, sublinear_tf=False, output_type=False):
    corpus = np.array(table[input_col])
    if max_df is None:
        max_df = len(corpus)
    tf_vectorizer = CountVectorizer(stop_words='english', max_df=max_df,
                                    min_df=min_df, max_features=num_voca)
    tf_vectorizer.fit(corpus)
    csr_matrix_tf = tf_vectorizer.transform(corpus)
    tfidf_vectorizer = TfidfTransformer(norm=norm, use_idf=True,
                                        smooth_idf=smooth_idf,
                                        sublinear_tf=sublinear_tf)
    csr_matrix_tfidf = tfidf_vectorizer.fit_transform(csr_matrix_tf)
    voca_dict = sorted(tf_vectorizer.vocabulary_.items(), key=itemgetter(1))
    len_voca = len(voca_dict)

    # TF-IDF table
    tfidf_table = pd.DataFrame()
    document_list = []
    docID_list = []
    if output_type is False:
        # Dense output: one row per (document, vocabulary) pair, zero counts included.
        vocabulary_list = []
        for doc in range(len(corpus)):
            docID_list += ['doc_{}'.format(doc) for _ in range(len_voca)]
            document_list += [str(corpus[doc]) for _ in range(len_voca)]
            vocabulary_list += [voca_dict[j][0] for j in range(len_voca)]
        tfidf_table['document_id'] = docID_list
        tfidf_table[input_col] = document_list
        tfidf_table['vocabulary'] = vocabulary_list
        tfidf_table['frequency'] = np.ravel(csr_matrix_tf.todense())
        if idf_weighting_scheme == 'inverseDocumentFrequency':
            tfidf_table['tfidf score'] = np.ravel(csr_matrix_tfidf.todense())
        elif idf_weighting_scheme == 'unary':
            tfidf_table['tfidf score'] = list(
                map(float, np.array(tfidf_table['frequency'])))
    elif output_type is True:
        # Sparse output: zero counts are dropped; walk the CSR row pointers so
        # each document contributes exactly its stored entries.
        for doc in range(len(corpus)):
            n_entries = (csr_matrix_tfidf.indptr[doc + 1]
                         - csr_matrix_tfidf.indptr[doc])
            docID_list += ['doc_{}'.format(doc) for _ in range(n_entries)]
            document_list += [str(corpus[doc]) for _ in range(n_entries)]
        tfidf_table['document_id'] = docID_list
        tfidf_table[input_col] = document_list
        tfidf_table['vocabulary'] = [voca_dict[i][0]
                                     for i in csr_matrix_tf.indices]
        tfidf_table['frequency'] = csr_matrix_tf.data
        if idf_weighting_scheme == 'inverseDocumentFrequency':
            # TfidfTransformer only rescales values, so csr_matrix_tfidf shares
            # csr_matrix_tf's sparsity structure and the data arrays align
            # entry for entry. (The original sliced from the start of the data
            # array for every document, which misaligned scores past doc 0.)
            tfidf_table['tfidf score'] = csr_matrix_tfidf.data
        elif idf_weighting_scheme == 'unary':
            tfidf_table['tfidf score'] = list(
                map(float, np.array(tfidf_table['frequency'])))
    else:
        raise_runtime_error("Please check 'output_type'.")

    # IDF table
    idf_table = pd.DataFrame()
    idf_table['vocabulary'] = [voca_dict[j][0] for j in range(len(voca_dict))]
    if idf_weighting_scheme == 'inverseDocumentFrequency':
        idf_table['idf weight'] = tfidf_vectorizer.idf_.tolist()
    elif idf_weighting_scheme == 'unary':
        idf_table['idf weight'] = float(1)

    params = {'Input Column': input_col,
              'Max DF': max_df,
              'Min DF': min_df,
              'Number of Vocabularies': num_voca,
              'IDF Weighting Scheme': idf_weighting_scheme,
              'Norm': norm,
              'Smooth IDF': smooth_idf,
              'Sublinear TF': sublinear_tf,
              'Remove Zero Counts': output_type}

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# TF-IDF Result"""))
    rb.addMD(strip_margin("""
    |
    |### Parameters
    |
    |{display_params}
    |
    |### IDF Table
    |
    |{idf_table}
    |
    |### TFIDF Table
    |
    |{tfidf_table}
    |
    """.format(display_params=dict2MD(params),
               idf_table=pandasDF2MD(idf_table, num_rows=200),
               tfidf_table=pandasDF2MD(tfidf_table, num_rows=200))))

    model = _model_dict('tfidf')
    model['csr_matrix_tf'] = csr_matrix_tf
    model['csr_matrix_tfidf'] = csr_matrix_tfidf
    model['parameter'] = params
    model['idf_table'] = idf_table
    model['tfidf_table'] = tfidf_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
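# Usage sketch (illustrative only; the 'text' column is hypothetical). Note
# that output_type=True drops zero-count (document, term) pairs, while
# output_type=False keeps every pair:
#
#   df = pd.DataFrame({'text': ['the cat sat', 'the dog ran',
#                               'a cat and a dog']})
#   res = _tfidf(df, input_col='text', min_df=1, output_type=True)
#   res['model']['tfidf_table']  # long format: document, term, frequency, score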
def _decision_tree_regression_train(table, feature_cols, label_col,
                                    criterion='mse', splitter='best',
                                    max_depth=None, min_samples_split=2,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0,
                                    max_features=None, random_state=None,
                                    max_leaf_nodes=None,
                                    min_impurity_decrease=0.0,
                                    min_impurity_split=None, presort=False,
                                    sample_weight=None, check_input=True,
                                    X_idx_sorted=None):
    param_validation_check = [
        greater_than_or_equal_to(min_samples_split, 2, 'min_samples_split'),
        greater_than_or_equal_to(min_samples_leaf, 1, 'min_samples_leaf'),
        greater_than_or_equal_to(min_weight_fraction_leaf, 0.0,
                                 'min_weight_fraction_leaf')]
    if max_depth is not None:
        param_validation_check.append(
            greater_than_or_equal_to(max_depth, 1, 'max_depth'))
    validate(*param_validation_check)

    # Pass parameters by keyword so the call is robust to signature reordering.
    regressor = DecisionTreeRegressor(
        criterion=criterion, splitter=splitter, max_depth=max_depth,
        min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features, random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split, presort=presort)
    regressor.fit(table[feature_cols], table[label_col],
                  sample_weight=sample_weight, check_input=check_input,
                  X_idx_sorted=X_idx_sorted)

    # Draw the tree with graphviz if it is available; fall back to a notice otherwise.
    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(regressor, out_file=dot_data,
                        feature_names=feature_cols, filled=True, rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = ("Graphviz is needed to draw a Decision Tree graph. "
                    "Please download it from http://graphviz.org/download/ "
                    "and install it to your computer.")

    # json
    model = _model_dict('decision_tree_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    feature_importance = regressor.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = regressor.max_features_
    model['n_features'] = regressor.n_features_
    model['n_outputs'] = regressor.n_outputs_
    model['tree'] = regressor.tree_
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor

    # report: horizontal bar chart of feature importances, sorted ascending.
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), feature_importance[indices], color='b',
             align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v, i, " {:.2f}".format(v), color='b', va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Decision Tree Regression Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
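# Usage sketch (illustrative only; columns are hypothetical). Graphviz and
# pydotplus are optional; without them the report falls back to a text notice:
#
#   df = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0], 'y': [1.1, 1.9, 3.2, 3.8]})
#   res = _decision_tree_regression_train(df, feature_cols=['x'],
#                                         label_col='y', max_depth=2)
#   res['model']['feature_importance']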
def _svm_classification_train(table, feature_cols, label_col, c=1.0,
                              kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                              shrinking=True, probability=True, tol=1e-3,
                              max_iter=-1, random_state=None,
                              class_weight=None):
    _table = table.copy()
    feature_names, features = check_col_type(table, feature_cols)
    _label_col = _table[label_col]
    if sklearn_utils.multiclass.type_of_target(_label_col) == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')

    # Map the user-supplied weight list onto the sorted class labels.
    class_labels = sorted(set(_label_col))
    if class_weight is not None:
        if len(class_weight) != len(class_labels):
            raise ValueError(
                "Number of class weights should match number of labels.")
        class_weight = {class_labels[i]: class_weight[i]
                        for i in range(len(class_labels))}

    _svc = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
                   shrinking=shrinking, probability=probability, tol=tol,
                   max_iter=max_iter, random_state=random_state,
                   class_weight=class_weight)
    _svc_model = _svc.fit(features, _label_col)

    get_param = _svc.get_params()
    get_param['feature_cols'] = feature_names
    get_param['label_col'] = label_col

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter}
    """.format(table_parameter=dict2MD(get_param))))

    _model = _model_dict('svc_model')
    _model['svc_model'] = _svc_model
    _model['features'] = feature_cols
    _model['_repr_brtc_'] = rb.get()

    return {'model': _model}
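# Usage sketch (illustrative only; columns are hypothetical). class_weight,
# when given, must be a list with one entry per distinct label, ordered to
# match the sorted labels:
#
#   df = pd.DataFrame({'f1': [0.1, 0.4, 0.8, 0.9],
#                      'f2': [1.0, 0.9, 0.1, 0.2],
#                      'label': ['a', 'a', 'b', 'b']})
#   res = _svm_classification_train(df, feature_cols=['f1', 'f2'],
#                                   label_col='label', class_weight=[1.0, 2.0])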
def _gaussian_mixture_train(table, input_cols, number_of_components=1,
                            covariance_type='full', tolerance=0.001,
                            regularize_covariance=1e-06, max_iteration=100,
                            initial_params='kmeans', seed=None):
    gmm = GaussianMixture(n_components=number_of_components,
                          covariance_type=covariance_type, tol=tolerance,
                          reg_covar=regularize_covariance,
                          max_iter=max_iteration, init_params=initial_params,
                          random_state=seed)
    X_train = table[input_cols]
    gmm.fit(X_train)

    # Summary: one row per mixture component with its weight, mean vector,
    # and covariance matrix.
    out_table = pd.DataFrame()
    out_table['component_number'] = list(range(number_of_components))
    out_table['weight'] = gmm.weights_
    out_table['mean_coordinate'] = [gmm.means_[i].tolist()
                                    for i in range(number_of_components)]
    out_table['covariance_matrix'] = [gmm.covariances_[i].tolist()
                                      for i in range(number_of_components)]

    params = {'Input Columns': input_cols,
              'Number of Components': number_of_components,
              'Covariance Type': covariance_type,
              'Tolerance': tolerance,
              'Regularization of Covariance': regularize_covariance,
              'Maximum Number of Iterations': max_iteration,
              'Initialization Method': initial_params}

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    |## Gaussian Mixture Train Result
    |
    |### Parameters
    |
    | {params}
    |
    |### Summary
    |
    |{result_table}
    |
    """.format(params=dict2MD(params), result_table=pandasDF2MD(out_table))))

    model = _model_dict('gaussian_mixture_train')
    model['input_cols'] = input_cols
    model['number_of_components'] = number_of_components
    model['covariance_type'] = covariance_type
    model['tolerance'] = tolerance
    model['regularize_covariance'] = regularize_covariance
    model['max_iteration'] = max_iteration
    model['initial_params'] = initial_params
    model['seed'] = seed
    model['summary'] = out_table
    model['gmm'] = gmm
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
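# Usage sketch (illustrative only; columns are hypothetical). The summary table
# carries one row per component with its weight, mean, and covariance:
#
#   df = pd.DataFrame({'x': [0.1, 0.2, 5.1, 5.2], 'y': [0.0, 0.1, 5.0, 5.3]})
#   res = _gaussian_mixture_train(df, input_cols=['x', 'y'],
#                                 number_of_components=2, seed=42)
#   res['model']['summary']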
def _xgb_classification_train(table, feature_cols, label_col, max_depth=3,
                              learning_rate=0.1, n_estimators=100, silent=True,
                              objective='binary:logistic', booster='gbtree',
                              n_jobs=1, nthread=None, gamma=0,
                              min_child_weight=1, max_delta_step=0, subsample=1,
                              colsample_bytree=1, colsample_bylevel=1,
                              reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                              base_score=0.5, random_state=0, seed=None,
                              missing=None, sample_weight=None, eval_set=None,
                              eval_metric=None, early_stopping_rounds=None,
                              verbose=True, xgb_model=None,
                              sample_weight_eval_set=None):
    validate(greater_than_or_equal_to(max_depth, 1, 'max_depth'),
             greater_than_or_equal_to(learning_rate, 0.0, 'learning_rate'),
             greater_than_or_equal_to(n_estimators, 1, 'n_estimators'))

    # Pass parameters by keyword so the call is robust to signature reordering.
    classifier = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate,
                               n_estimators=n_estimators, silent=silent,
                               objective=objective, booster=booster,
                               n_jobs=n_jobs, nthread=nthread, gamma=gamma,
                               min_child_weight=min_child_weight,
                               max_delta_step=max_delta_step,
                               subsample=subsample,
                               colsample_bytree=colsample_bytree,
                               colsample_bylevel=colsample_bylevel,
                               reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                               scale_pos_weight=scale_pos_weight,
                               base_score=base_score,
                               random_state=random_state, seed=seed,
                               missing=missing)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight=sample_weight, eval_set=eval_set,
                   eval_metric=eval_metric,
                   early_stopping_rounds=early_stopping_rounds,
                   verbose=verbose, xgb_model=xgb_model,
                   sample_weight_eval_set=sample_weight_eval_set)

    # json
    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_

    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()

    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier

    # report
    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Importance
    | {fig_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
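# Usage sketch (illustrative only; columns are hypothetical). eval_set,
# eval_metric, and early_stopping_rounds are forwarded to XGBClassifier.fit:
#
#   df = pd.DataFrame({'f1': [0, 1, 2, 3], 'f2': [3, 2, 1, 0],
#                      'label': [0, 0, 1, 1]})
#   res = _xgb_classification_train(df, feature_cols=['f1', 'f2'],
#                                   label_col='label', n_estimators=10)
#   res['model']['feature_importance']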